In [1]:
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
from spectralcluster import SpectralClusterer
from spectralcluster import RefinementOptions
from spectralcluster import ThresholdType
from spectralcluster import ICASSP2018_REFINEMENT_SEQUENCE
from resemblyzer.audio import sampling_rate
from pydub import AudioSegment
import numpy as np
import os
import joblib
import librosa
import pandas as pd

## Définition de la fonction générale

In [2]:
def count_speakers1(audio_path):
    
    def extract_speech(audio_path):
        segmentation = Segmenter(detect_gender=False)
        return segmentation(audio_path)
    
    def concatenate_segments(segmentation):
        t1 = [] #Temps de début de parole
        t2 = [] #Temps de fin de parole

        #Remplissage des tableaux précédents
        for i in range(len(segmentation)):
            #if (segmentation[i][0] == 'male') or (segmentation[i][0] == 'female'):
            if segmentation[i][0] == 'speech':
                t1.append(segmentation[i][1])
                t2.append(segmentation[i][2])

        #Importation du fichier .wav dont on veut les différents locuteurs
        newAudio = AudioSegment.from_wav(audio_path)
        #Génération de deux fichiers audio vides
        audio = AudioSegment.empty()

        # Génération fichier
        for i in range(len(t1)):
            audio += newAudio[t1[i]*1000:t2[i]*1000]
        audio.export(out_f = "cut_audio.wav", 
                            format = "wav")
        return "cut_audio.wav"
    
    def speaker_segmentation(audio):
        labelling = []
        if os.stat(audio).st_size > 100000 :
            wav_fpath = Path(audio)
            wav = preprocess_wav(wav_fpath)
            encoder = VoiceEncoder("cpu")
            _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

            # Pour déterminer les différents locuteurs/locutrices, on clusterise
            refinement_options = RefinementOptions(gaussian_blur_sigma=1,
                                                    p_percentile=0.90,
                                                    thresholding_soft_multiplier=0.01,
                                                    thresholding_type=ThresholdType.RowMax,
                                                    refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE)

            clusterer = SpectralClusterer(min_clusters=2,
                                          max_clusters=100,
                                          refinement_options=refinement_options)

            labels = clusterer.predict(cont_embeds)

            times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
            start_time = 0

            for i,time in enumerate(times):
                if i>0 and labels[i]!=labels[i-1]:
                    temp = [str(labels[i-1]),start_time,time]
                    labelling.append(tuple(temp))
                    start_time = time
                if i==len(times)-1:
                    temp = [str(labels[i]),start_time,time]
                    labelling.append(tuple(temp))
        else:
            labelling.append(-1)
        return labelling
    
    def extract_audios(labelling):
        audio = AudioSegment.from_wav(audio_path)
        L=[]
        for t in labelling:
            if not(t == -1):
                new_audio = AudioSegment.empty()
                new_audio = audio[t[1]*1000:t[2]*1000]
                out_f= "new_audio"+str([t[1],t[2]])+".wav"
                new_audio.export(out_f , format = "wav")
                L.append(out_f)
        return(L)
    
    def different_speakers(audio_path):
    
        ## Exécution partie 1 : 
        # Etape I : 
        segmentation = extract_speech(audio_path)
        # Etape II : 
        newAudio = concatenate_segments(segmentation)
        # Etape III :
        labelling = speaker_segmentation(newAudio)
        # Etape IV : 
        resultat = extract_audios(labelling)
            
        return [resultat, labelling]

    def get_char_son(fichier):
        signal, sr=librosa.load(fichier)
        S = np.abs(librosa.stft(signal))
        comps, acts = librosa.decompose.decompose(S, n_components=1)
        data=pd.DataFrame([np.ravel(comps).tolist()])
        data['ID']=fichier
        return(data.set_index('ID'))
    
    def predict_sound(tab):
        loaded_model = joblib.load('Reconnaissance Femme_Homme\my_model.pkl')
        if loaded_model.predict(tab)[0]==0:
            return('Homme')
        return('Femme')
    
    part1 = different_speakers(audio_path)
    print(part1)
    files = part1[0]
    labelling = part1[1]
    dict_speaker = {}
    dict_speaker_sex = {}
    for i in range(len(files)):
        tab = get_char_son(files[i])
        pred_sex = predict_sound(tab)
        try:
            dict_speaker[labelling[i]] = 1 + dict_speaker[labelling[i]]
        except:
            dict_speaker[labelling[i]] = 1
        dict_speaker_sex[labelling[i]] = pred_sex

    c_male = 0
    c_female = 0 
    for cle,valeur in dict_speaker_sex.items():
        if valeur == 'Homme':
            c_male = c_male + dict_speaker[cle]
        elif valeur == 'Femme':
            c_female = c_female + dict_speaker[cle]

    result = {'M':c_male , 'F':c_female}
    return result
    

In [3]:
count_speakers1("moviesoundclips.net_test/moviesoundclips.net/959.wav")

  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.08 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 2.66].wav', 'new_audio[2.66, 4.1].wav', 'new_audio[4.1, 5.42].wav', 'new_audio[5.42, 6.5].wav', 'new_audio[6.5, 7.76].wav', 'new_audio[7.76, 8.78].wav', 'new_audio[8.78, 9.8].wav'], [('4', 0, 2.0), ('6', 2.0, 2.66), ('3', 2.66, 4.1), ('2', 4.1, 5.42), ('7', 5.42, 6.5), ('1', 6.5, 7.76), ('0', 7.76, 8.78), ('5', 8.78, 9.8)]]




{'M': 8, 'F': 0}

## Liste des fichiers à tester

In [4]:
import os
res={}
L=os.listdir('moviesoundclips.net_test/moviesoundclips.net/')
print(L)

['100.wav', '1065.wav', '1156.wav', '1162.wav', '1167.wav', '1226.wav', '125.wav', '126.wav', '127.wav', '1310.wav', '1313.wav', '1327.wav', '137.wav', '15.wav', '1508.wav', '1532.wav', '1551.wav', '156.wav', '157.wav', '161.wav', '1666.wav', '1727.wav', '173.wav', '1730.wav', '1731.wav', '1754.wav', '1757.wav', '1758.wav', '1770.wav', '1777.wav', '1819.wav', '1826.wav', '1827.wav', '1832.wav', '1875.wav', '1892.wav', '1976.wav', '1990.wav', '1994.wav', '1996.wav', '2062.wav', '2071.wav', '2087.wav', '2097.wav', '2129.wav', '2134.wav', '2159.wav', '2163.wav', '2164.wav', '2191.wav', '2195.wav', '220.wav', '2213.wav', '2296.wav', '2317.wav', '2322.wav', '2375.wav', '2382.wav', '2389.wav', '2392.wav', '2395.wav', '240.wav', '2406.wav', '2428.wav', '2430.wav', '2453.wav', '2455.wav', '2458.wav', '2463.wav', '2470.wav', '2481.wav', '2538.wav', '2542.wav', '2546.wav', '2553.wav', '2569.wav', '2636.wav', '2667.wav', '2693.wav', '2696.wav', '2704.wav', '2724.wav', '2751.wav', '2753.wav', '280

# Test sur le jeu de données

In [5]:
res3={}
for i in range(len(L)):
    sep=(L[i].find('.'))
    num=L[i][:sep]
    try:
        res_prov=count_speakers1("moviesoundclips.net_test/moviesoundclips.net/"+L[i])
        res3[str(num)]=res_prov
    except:
        print('Erreur')
    print(i/len(L))

  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.46].wav', 'new_audio[1.46, 2.84].wav', 'new_audio[2.84, 3.2].wav', 'new_audio[3.2, 4.34].wav', 'new_audio[4.34, 6.08].wav', 'new_audio[6.08, 6.62].wav', 'new_audio[6.62, 7.94].wav', 'new_audio[7.94, 8.84].wav', 'new_audio[8.84, 10.64].wav'], [('4', 0, 1.46), ('6', 1.46, 2.84), ('7', 2.84, 3.2), ('0', 3.2, 4.34), ('1', 4.34, 6.08), ('7', 6.08, 6.62), ('2', 6.62, 7.94), ('5', 7.94, 8.84), ('3', 8.84, 10.64)]]




0.0


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 5.42].wav', 'new_audio[5.42, 6.2].wav', 'new_audio[6.2, 9.32].wav', 'new_audio[9.32, 15.74].wav'], [('1', 0, 5.42), ('0', 5.42, 6.2), ('1', 6.2, 9.32), ('0', 9.32, 15.74)]]




0.004032258064516129


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.94].wav', 'new_audio[1.94, 3.26].wav', 'new_audio[3.26, 4.58].wav', 'new_audio[4.58, 5.96].wav', 'new_audio[5.96, 7.76].wav', 'new_audio[7.76, 7.82].wav', 'new_audio[7.82, 9.5].wav'], [('1', 0, 1.94), ('5', 1.94, 3.26), ('0', 3.26, 4.58), ('4', 4.58, 5.96), ('3', 5.96, 7.76), ('0', 7.76, 7.82), ('2', 7.82, 9.5)]]


  return f(*args, **kwargs)


0.008064516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.012096774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.24].wav', 'new_audio[2.24, 3.14].wav', 'new_audio[3.14, 4.34].wav', 'new_audio[4.34, 5.48].wav', 'new_audio[5.48, 7.34].wav', 'new_audio[7.34, 8.3].wav'], [('3', 0, 2.24), ('5', 2.24, 3.14), ('0', 3.14, 4.34), ('4', 4.34, 5.48), ('1', 5.48, 7.34), ('2', 7.34, 8.3)]]




0.016129032258064516


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.0].wav', 'new_audio[2.0, 3.44].wav'], [('1', 0, 2.0), ('0', 2.0, 3.44)]]




0.020161290322580645


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.024193548387096774


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.028225806451612902


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.03225806451612903


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.72].wav', 'new_audio[2.72, 3.14].wav', 'new_audio[3.14, 4.52].wav'], [('2', 0, 2.12), ('0', 2.12, 2.72), ('3', 2.72, 3.14), ('1', 3.14, 4.52)]]




0.036290322580645164


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 3.38].wav', 'new_audio[3.38, 5.0].wav', 'new_audio[5.0, 6.62].wav'], [('3', 0, 2.0), ('1', 2.0, 3.38), ('2', 3.38, 5.0), ('0', 5.0, 6.62)]]




0.04032258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.9].wav', 'new_audio[2.9, 4.04].wav'], [('0', 0, 2.12), ('2', 2.12, 2.9), ('1', 2.9, 4.04)]]




0.04435483870967742


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.04838709677419355


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.05241935483870968


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.46].wav', 'new_audio[1.46, 2.96].wav', 'new_audio[2.96, 3.56].wav', 'new_audio[3.56, 4.52].wav', 'new_audio[4.52, 6.02].wav', 'new_audio[6.02, 7.1].wav', 'new_audio[7.1, 7.76].wav', 'new_audio[7.76, 9.5].wav', 'new_audio[9.5, 10.22].wav', 'new_audio[10.22, 11.9].wav'], [('6', 0, 1.46), ('5', 1.46, 2.96), ('0', 2.96, 3.56), ('8', 3.56, 4.52), ('2', 4.52, 6.02), ('7', 6.02, 7.1), ('1', 7.1, 7.76), ('3', 7.76, 9.5), ('1', 9.5, 10.22), ('4', 10.22, 11.9)]]




0.056451612903225805


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 4.46].wav', 'new_audio[4.46, 5.48].wav', 'new_audio[5.48, 7.52].wav', 'new_audio[7.52, 7.76].wav', 'new_audio[7.76, 9.5].wav', 'new_audio[9.5, 10.7].wav'], [('5', 0, 3.02), ('0', 3.02, 4.46), ('4', 4.46, 5.48), ('1', 5.48, 7.52), ('0', 7.52, 7.76), ('2', 7.76, 9.5), ('3', 9.5, 10.7)]]




0.06048387096774194


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 3.56].wav', 'new_audio[3.56, 4.76].wav', 'new_audio[4.76, 5.3].wav', 'new_audio[5.3, 6.74].wav', 'new_audio[6.74, 8.54].wav', 'new_audio[8.54, 10.28].wav', 'new_audio[10.28, 11.72].wav', 'new_audio[11.72, 13.16].wav', 'new_audio[13.16, 15.38].wav'], [('4', 0, 1.82), ('1', 1.82, 3.56), ('8', 3.56, 4.76), ('2', 4.76, 5.3), ('6', 5.3, 6.74), ('7', 6.74, 8.54), ('0', 8.54, 10.28), ('3', 10.28, 11.72), ('5', 11.72, 13.16), ('2', 13.16, 15.38)]]




0.06451612903225806


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.7].wav', 'new_audio[1.7, 2.3].wav', 'new_audio[2.3, 3.08].wav', 'new_audio[3.08, 4.82].wav', 'new_audio[4.82, 6.32].wav', 'new_audio[6.32, 7.52].wav', 'new_audio[7.52, 8.06].wav', 'new_audio[8.06, 10.04].wav'], [('5', 0, 1.7), ('6', 1.7, 2.3), ('0', 2.3, 3.08), ('3', 3.08, 4.82), ('2', 4.82, 6.32), ('4', 6.32, 7.52), ('7', 7.52, 8.06), ('1', 8.06, 10.04)]]




0.06854838709677419


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.46].wav', 'new_audio[1.46, 2.06].wav', 'new_audio[2.06, 3.38].wav', 'new_audio[3.38, 4.76].wav', 'new_audio[4.76, 6.32].wav', 'new_audio[6.32, 7.34].wav', 'new_audio[7.34, 7.58].wav', 'new_audio[7.58, 8.84].wav', 'new_audio[8.84, 10.1].wav'], [('2', 0, 1.46), ('0', 1.46, 2.06), ('4', 2.06, 3.38), ('5', 3.38, 4.76), ('1', 4.76, 6.32), ('6', 6.32, 7.34), ('0', 7.34, 7.58), ('7', 7.58, 8.84), ('3', 8.84, 10.1)]]




0.07258064516129033


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.07661290322580645


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 3.92].wav', 'new_audio[3.92, 5.18].wav', 'new_audio[5.18, 6.8].wav', 'new_audio[6.8, 6.92].wav', 'new_audio[6.92, 8.84].wav', 'new_audio[8.84, 10.52].wav', 'new_audio[10.52, 12.2].wav', 'new_audio[12.2, 13.4].wav', 'new_audio[13.4, 15.38].wav'], [('6', 0, 1.88), ('1', 1.88, 3.92), ('5', 3.92, 5.18), ('3', 5.18, 6.8), ('8', 6.8, 6.92), ('2', 6.92, 8.84), ('8', 8.84, 10.52), ('0', 10.52, 12.2), ('7', 12.2, 13.4), ('4', 13.4, 15.38)]]




0.08064516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.34].wav', 'new_audio[1.34, 3.26].wav', 'new_audio[3.26, 3.62].wav'], [('2', 0, 1.34), ('1', 1.34, 3.26), ('0', 3.26, 3.62)]]




0.0846774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.96].wav', 'new_audio[2.96, 4.34].wav', 'new_audio[4.34, 6.2].wav', 'new_audio[6.2, 6.68].wav', 'new_audio[6.68, 8.78].wav'], [('2', 0, 2.96), ('1', 2.96, 4.34), ('3', 4.34, 6.2), ('1', 6.2, 6.68), ('0', 6.68, 8.78)]]




0.08870967741935484


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 5.42].wav', 'new_audio[5.42, 7.82].wav', 'new_audio[7.82, 10.52].wav'], [('3', 0, 3.02), ('1', 3.02, 5.42), ('0', 5.42, 7.82), ('2', 7.82, 10.52)]]




0.09274193548387097


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.34].wav', 'new_audio[1.34, 1.7].wav'], [('1', 0, 1.34), ('0', 1.34, 1.7)]]




0.0967741935483871


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.10080645161290322


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 8.96].wav', 'new_audio[8.96, 9.08].wav', 'new_audio[9.08, 14.72].wav', 'new_audio[14.72, 18.92].wav', 'new_audio[18.92, 19.04].wav', 'new_audio[19.04, 19.16].wav', 'new_audio[19.16, 21.74].wav', 'new_audio[21.74, 28.58].wav'], [('0', 0, 8.96), ('1', 8.96, 9.08), ('0', 9.08, 14.72), ('1', 14.72, 18.92), ('0', 18.92, 19.04), ('1', 19.04, 19.16), ('0', 19.16, 21.74), ('1', 21.74, 28.58)]]




0.10483870967741936


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.72].wav', 'new_audio[2.72, 4.76].wav', 'new_audio[4.76, 5.12].wav', 'new_audio[5.12, 6.2].wav', 'new_audio[6.2, 8.3].wav', 'new_audio[8.3, 9.98].wav', 'new_audio[9.98, 10.82].wav', 'new_audio[10.82, 13.04].wav', 'new_audio[13.04, 14.96].wav', 'new_audio[14.96, 15.02].wav', 'new_audio[15.02, 15.08].wav', 'new_audio[15.08, 16.94].wav', 'new_audio[16.94, 17.9].wav', 'new_audio[17.9, 19.64].wav', 'new_audio[19.64, 20.36].wav', 'new_audio[20.36, 20.42].wav', 'new_audio[20.42, 20.48].wav', 'new_audio[20.48, 21.44].wav', 'new_audio[21.44, 21.5].wav', 'new_audio[21.5, 21.86].wav', 'new_audio[21.86, 23.66].wav'], [('7', 0, 2.72), ('6', 2.72, 4.76), ('4', 4.76, 5.12), ('9', 5.12, 6.2), ('10', 6.2, 8.3), ('2', 8.3, 9.98), ('4', 9.98, 10.82), ('8', 10.82, 13.04), ('5', 13.04, 14.96), ('6', 14.96, 15.02), ('8', 15.02, 15.08), ('1', 15.08, 16.94), ('4', 16.94, 17.9), ('0', 17.9, 19.64), ('4', 19.64, 20.36), ('1', 20.36, 20.42), ('9', 20.42, 20.48), ('1', 20.48, 21.44), ('3', 21.44,

  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.10887096774193548


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 4.76].wav'], [('0', 0, 3.02), ('1', 3.02, 4.76)]]




0.11290322580645161


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 2.12].wav', 'new_audio[2.12, 2.9].wav', 'new_audio[2.9, 5.24].wav', 'new_audio[5.24, 6.02].wav', 'new_audio[6.02, 7.52].wav'], [('4', 0, 2.0), ('0', 2.0, 2.12), ('3', 2.12, 2.9), ('2', 2.9, 5.24), ('0', 5.24, 6.02), ('1', 6.02, 7.52)]]




0.11693548387096774


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.2].wav', 'new_audio[3.2, 3.26].wav', 'new_audio[3.26, 10.52].wav', 'new_audio[10.52, 12.32].wav', 'new_audio[12.32, 12.44].wav', 'new_audio[12.44, 14.42].wav', 'new_audio[14.42, 15.62].wav', 'new_audio[15.62, 24.56].wav'], [('1', 0, 3.2), ('0', 3.2, 3.26), ('1', 3.26, 10.52), ('0', 10.52, 12.32), ('1', 12.32, 12.44), ('0', 12.44, 14.42), ('1', 14.42, 15.62), ('0', 15.62, 24.56)]]


  return f(*args, **kwargs)


0.12096774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.52].wav', 'new_audio[1.52, 3.5].wav'], [('1', 0, 1.52), ('0', 1.52, 3.5)]]




0.125


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.04].wav', 'new_audio[1.04, 2.0].wav'], [('1', 0, 1.04), ('0', 1.04, 2.0)]]




0.12903225806451613


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.9].wav', 'new_audio[2.9, 6.56].wav', 'new_audio[6.56, 10.82].wav', 'new_audio[10.82, 12.56].wav', 'new_audio[12.56, 13.94].wav'], [('4', 0, 2.9), ('0', 2.9, 6.56), ('1', 6.56, 10.82), ('3', 10.82, 12.56), ('2', 12.56, 13.94)]]




0.13306451612903225


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 0.92].wav', 'new_audio[0.92, 1.04].wav', 'new_audio[1.04, 2.24].wav', 'new_audio[2.24, 4.4].wav', 'new_audio[4.4, 5.3].wav', 'new_audio[5.3, 7.04].wav', 'new_audio[7.04, 7.16].wav', 'new_audio[7.16, 7.22].wav', 'new_audio[7.22, 8.48].wav', 'new_audio[8.48, 10.64].wav', 'new_audio[10.64, 11.6].wav'], [('2', 0, 0.92), ('4', 0.92, 1.04), ('2', 1.04, 2.24), ('0', 2.24, 4.4), ('5', 4.4, 5.3), ('1', 5.3, 7.04), ('2', 7.04, 7.16), ('5', 7.16, 7.22), ('3', 7.22, 8.48), ('4', 8.48, 10.64), ('5', 10.64, 11.6)]]


  return f(*args, **kwargs)


0.13709677419354838


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.84].wav', 'new_audio[2.84, 3.98].wav'], [('1', 0, 2.84), ('0', 2.84, 3.98)]]




0.14112903225806453


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.78].wav', 'new_audio[2.78, 3.68].wav', 'new_audio[3.68, 5.42].wav', 'new_audio[5.42, 6.44].wav', 'new_audio[6.44, 6.86].wav', 'new_audio[6.86, 7.4].wav', 'new_audio[7.4, 9.14].wav', 'new_audio[9.14, 10.46].wav'], [('2', 0, 2.12), ('3', 2.12, 2.78), ('5', 2.78, 3.68), ('3', 3.68, 5.42), ('6', 5.42, 6.44), ('4', 6.44, 6.86), ('1', 6.86, 7.4), ('7', 7.4, 9.14), ('0', 9.14, 10.46)]]




0.14516129032258066


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.36].wav', 'new_audio[2.36, 2.66].wav', 'new_audio[2.66, 3.44].wav', 'new_audio[3.44, 3.5].wav', 'new_audio[3.5, 5.18].wav', 'new_audio[5.18, 5.72].wav', 'new_audio[5.72, 6.5].wav'], [('2', 0, 2.36), ('5', 2.36, 2.66), ('3', 2.66, 3.44), ('5', 3.44, 3.5), ('1', 3.5, 5.18), ('0', 5.18, 5.72), ('4', 5.72, 6.5)]]


  return f(*args, **kwargs)


0.14919354838709678


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.14].wav', 'new_audio[3.14, 3.2].wav', 'new_audio[3.2, 3.32].wav', 'new_audio[3.32, 4.4].wav', 'new_audio[4.4, 13.04].wav', 'new_audio[13.04, 13.94].wav', 'new_audio[13.94, 18.98].wav', 'new_audio[18.98, 19.52].wav', 'new_audio[19.52, 20.54].wav', 'new_audio[20.54, 21.8].wav', 'new_audio[21.8, 26.48].wav', 'new_audio[26.48, 27.14].wav', 'new_audio[27.14, 31.04].wav', 'new_audio[31.04, 32.66].wav', 'new_audio[32.66, 36.74].wav', 'new_audio[36.74, 38.18].wav', 'new_audio[38.18, 42.98].wav', 'new_audio[42.98, 43.76].wav', 'new_audio[43.76, 46.52].wav', 'new_audio[46.52, 46.64].wav', 'new_audio[46.64, 55.28].wav', 'new_audio[55.28, 61.94].wav', 'new_audio[61.94, 63.68].wav'], [('0', 0, 3.14), ('1', 3.14, 3.2), ('0', 3.2, 3.32), ('1', 3.32, 4.4), ('0', 4.4, 13.04), ('1', 13.04, 13.94), ('0', 13.94, 18.98), ('1', 18.98, 19.52), ('0', 19.52, 20.54), ('1', 20.54, 21.8), ('0', 21.8, 26.48), ('1', 26.48, 27.14), ('0', 27.14, 31.04), ('1', 31.04, 32.66), ('0', 32.66, 36.74), ('1'

  return f(*args, **kwargs)




0.1532258064516129


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 2.9].wav', 'new_audio[2.9, 4.58].wav', 'new_audio[4.58, 5.84].wav', 'new_audio[5.84, 6.86].wav'], [('0', 0, 1.88), ('3', 1.88, 2.9), ('2', 2.9, 4.58), ('1', 4.58, 5.84), ('4', 5.84, 6.86)]]




0.15725806451612903


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.24].wav', 'new_audio[2.24, 3.02].wav'], [('0', 0, 2.24), ('1', 2.24, 3.02)]]




0.16129032258064516


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.1].wav', 'new_audio[1.1, 2.72].wav', 'new_audio[2.72, 4.46].wav', 'new_audio[4.46, 5.24].wav', 'new_audio[5.24, 5.3].wav', 'new_audio[5.3, 6.98].wav', 'new_audio[6.98, 8.42].wav', 'new_audio[8.42, 9.68].wav', 'new_audio[9.68, 10.7].wav', 'new_audio[10.7, 11.12].wav', 'new_audio[11.12, 11.24].wav', 'new_audio[11.24, 11.54].wav', 'new_audio[11.54, 11.66].wav', 'new_audio[11.66, 13.28].wav', 'new_audio[13.28, 14.36].wav', 'new_audio[14.36, 15.62].wav'], [('4', 0, 1.1), ('0', 1.1, 2.72), ('10', 2.72, 4.46), ('5', 4.46, 5.24), ('6', 5.24, 5.3), ('2', 5.3, 6.98), ('1', 6.98, 8.42), ('3', 8.42, 9.68), ('7', 9.68, 10.7), ('5', 10.7, 11.12), ('0', 11.12, 11.24), ('5', 11.24, 11.54), ('0', 11.54, 11.66), ('6', 11.66, 13.28), ('9', 13.28, 14.36), ('8', 14.36, 15.62)]]


  return f(*args, **kwargs)




0.16532258064516128


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.22].wav', 'new_audio[1.22, 2.72].wav'], [('0', 0, 1.22), ('1', 1.22, 2.72)]]




0.1693548387096774


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.17338709677419356


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.6].wav', 'new_audio[2.6, 3.92].wav'], [('1', 0, 2.6), ('0', 2.6, 3.92)]]




0.1774193548387097


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 3.26].wav', 'new_audio[3.26, 3.32].wav', 'new_audio[3.32, 4.88].wav', 'new_audio[4.88, 5.36].wav', 'new_audio[5.36, 7.16].wav', 'new_audio[7.16, 8.06].wav', 'new_audio[8.06, 9.44].wav', 'new_audio[9.44, 10.76].wav', 'new_audio[10.76, 11.42].wav'], [('7', 0, 1.82), ('3', 1.82, 3.26), ('4', 3.26, 3.32), ('1', 3.32, 4.88), ('5', 4.88, 5.36), ('0', 5.36, 7.16), ('8', 7.16, 8.06), ('2', 8.06, 9.44), ('4', 9.44, 10.76), ('6', 10.76, 11.42)]]


  return f(*args, **kwargs)


0.1814516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.18548387096774194


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.94].wav', 'new_audio[1.94, 3.26].wav', 'new_audio[3.26, 4.82].wav', 'new_audio[4.82, 5.72].wav', 'new_audio[5.72, 7.46].wav', 'new_audio[7.46, 8.78].wav', 'new_audio[8.78, 9.62].wav', 'new_audio[9.62, 10.76].wav', 'new_audio[10.76, 12.02].wav', 'new_audio[12.02, 13.58].wav'], [('4', 0, 1.94), ('5', 1.94, 3.26), ('2', 3.26, 4.82), ('8', 4.82, 5.72), ('3', 5.72, 7.46), ('6', 7.46, 8.78), ('9', 8.78, 9.62), ('7', 9.62, 10.76), ('0', 10.76, 12.02), ('1', 12.02, 13.58)]]




0.18951612903225806


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.54].wav', 'new_audio[2.54, 2.78].wav'], [('0', 0, 2.54), ('1', 2.54, 2.78)]]




0.1935483870967742


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.1975806451612903


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.20161290322580644


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.2056451612903226


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.06].wav', 'new_audio[2.06, 2.96].wav', 'new_audio[2.96, 4.04].wav', 'new_audio[4.04, 5.54].wav', 'new_audio[5.54, 6.44].wav', 'new_audio[6.44, 7.04].wav', 'new_audio[7.04, 8.06].wav', 'new_audio[8.06, 9.5].wav', 'new_audio[9.5, 10.28].wav', 'new_audio[10.28, 11.78].wav'], [('4', 0, 2.06), ('2', 2.06, 2.96), ('5', 2.96, 4.04), ('0', 4.04, 5.54), ('8', 5.54, 6.44), ('6', 6.44, 7.04), ('9', 7.04, 8.06), ('1', 8.06, 9.5), ('7', 9.5, 10.28), ('3', 10.28, 11.78)]]




0.20967741935483872


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.14].wav', 'new_audio[3.14, 3.98].wav', 'new_audio[3.98, 4.94].wav', 'new_audio[4.94, 5.9].wav', 'new_audio[5.9, 6.98].wav', 'new_audio[6.98, 7.52].wav', 'new_audio[7.52, 8.36].wav'], [('2', 0, 3.14), ('3', 3.14, 3.98), ('5', 3.98, 4.94), ('0', 4.94, 5.9), ('4', 5.9, 6.98), ('6', 6.98, 7.52), ('1', 7.52, 8.36)]]




0.21370967741935484


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.88].wav', 'new_audio[1.88, 3.56].wav'], [('1', 0, 1.88), ('0', 1.88, 3.56)]]




0.21774193548387097


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.2217741935483871


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.1].wav', 'new_audio[1.1, 1.34].wav'], [('1', 0, 1.1), ('0', 1.1, 1.34)]]




0.22580645161290322


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 22.46].wav', 'new_audio[22.46, 26.72].wav', 'new_audio[26.72, 30.14].wav', 'new_audio[30.14, 37.1].wav', 'new_audio[37.1, 54.2].wav', 'new_audio[54.2, 61.64].wav', 'new_audio[61.64, 72.74].wav', 'new_audio[72.74, 74.78].wav', 'new_audio[74.78, 76.28].wav'], [('0', 0, 22.46), ('1', 22.46, 26.72), ('0', 26.72, 30.14), ('1', 30.14, 37.1), ('0', 37.1, 54.2), ('1', 54.2, 61.64), ('0', 61.64, 72.74), ('1', 72.74, 74.78), ('0', 74.78, 76.28)]]




0.22983870967741934


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.52].wav', 'new_audio[1.52, 2.84].wav', 'new_audio[2.84, 4.22].wav', 'new_audio[4.22, 4.64].wav', 'new_audio[4.64, 6.5].wav', 'new_audio[6.5, 8.78].wav', 'new_audio[8.78, 8.84].wav', 'new_audio[8.84, 8.9].wav', 'new_audio[8.9, 9.68].wav'], [('2', 0, 1.52), ('5', 1.52, 2.84), ('0', 2.84, 4.22), ('1', 4.22, 4.64), ('4', 4.64, 6.5), ('1', 6.5, 8.78), ('4', 8.78, 8.84), ('0', 8.84, 8.9), ('3', 8.9, 9.68)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.23387096774193547


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 1.22].wav', 'new_audio[1.22, 1.52].wav'], [('1', 0, 1.22), ('0', 1.22, 1.52)]]




0.23790322580645162


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.58].wav', 'new_audio[1.58, 3.02].wav', 'new_audio[3.02, 5.24].wav', 'new_audio[5.24, 6.2].wav', 'new_audio[6.2, 8.36].wav', 'new_audio[8.36, 9.5].wav'], [('5', 0, 1.58), ('2', 1.58, 3.02), ('0', 3.02, 5.24), ('4', 5.24, 6.2), ('1', 6.2, 8.36), ('3', 8.36, 9.5)]]




0.24193548387096775


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.24596774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.64].wav', 'new_audio[1.64, 3.2].wav', 'new_audio[3.2, 5.12].wav', 'new_audio[5.12, 6.2].wav'], [('2', 0, 1.64), ('1', 1.64, 3.2), ('3', 3.2, 5.12), ('0', 5.12, 6.2)]]




0.25


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.06].wav', 'new_audio[2.06, 3.44].wav', 'new_audio[3.44, 5.54].wav', 'new_audio[5.54, 7.28].wav', 'new_audio[7.28, 8.3].wav', 'new_audio[8.3, 9.26].wav', 'new_audio[9.26, 10.7].wav'], [('2', 0, 2.06), ('6', 2.06, 3.44), ('1', 3.44, 5.54), ('4', 5.54, 7.28), ('3', 7.28, 8.3), ('0', 8.3, 9.26), ('5', 9.26, 10.7)]]




0.2540322580645161


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 2.9].wav', 'new_audio[2.9, 3.92].wav', 'new_audio[3.92, 6.14].wav', 'new_audio[6.14, 7.76].wav', 'new_audio[7.76, 8.9].wav'], [('4', 0, 1.82), ('0', 1.82, 2.9), ('5', 2.9, 3.92), ('1', 3.92, 6.14), ('2', 6.14, 7.76), ('3', 7.76, 8.9)]]




0.25806451612903225


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 3.32].wav', 'new_audio[3.32, 5.12].wav', 'new_audio[5.12, 5.3].wav'], [('0', 0, 3.32), ('1', 3.32, 5.12), ('0', 5.12, 5.3)]]




0.2620967741935484


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.16].wav', 'new_audio[1.16, 2.48].wav'], [('1', 0, 1.16), ('0', 1.16, 2.48)]]




0.2661290322580645


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.36].wav', 'new_audio[2.36, 4.16].wav', 'new_audio[4.16, 5.9].wav', 'new_audio[5.9, 7.88].wav', 'new_audio[7.88, 9.14].wav'], [('2', 0, 2.36), ('0', 2.36, 4.16), ('3', 4.16, 5.9), ('1', 5.9, 7.88), ('0', 7.88, 9.14)]]




0.2701612903225806


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.27419354838709675


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.4].wav', 'new_audio[1.4, 2.18].wav', 'new_audio[2.18, 2.24].wav', 'new_audio[2.24, 2.3].wav', 'new_audio[2.3, 2.9].wav', 'new_audio[2.9, 2.96].wav', 'new_audio[2.96, 3.02].wav', 'new_audio[3.02, 3.14].wav', 'new_audio[3.14, 3.32].wav', 'new_audio[3.32, 3.44].wav', 'new_audio[3.44, 3.62].wav'], [('5', 0, 1.4), ('2', 1.4, 2.18), ('8', 2.18, 2.24), ('6', 2.24, 2.3), ('1', 2.3, 2.9), ('7', 2.9, 2.96), ('3', 2.96, 3.02), ('4', 3.02, 3.14), ('3', 3.14, 3.32), ('4', 3.32, 3.44), ('0', 3.44, 3.62)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.2782258064516129


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.52].wav', 'new_audio[1.52, 2.9].wav', 'new_audio[2.9, 4.28].wav', 'new_audio[4.28, 5.9].wav', 'new_audio[5.9, 6.86].wav', 'new_audio[6.86, 8.72].wav', 'new_audio[8.72, 9.5].wav', 'new_audio[9.5, 10.34].wav', 'new_audio[10.34, 11.66].wav', 'new_audio[11.66, 12.86].wav'], [('5', 0, 1.52), ('0', 1.52, 2.9), ('3', 2.9, 4.28), ('1', 4.28, 5.9), ('6', 5.9, 6.86), ('4', 6.86, 8.72), ('9', 8.72, 9.5), ('2', 9.5, 10.34), ('7', 10.34, 11.66), ('8', 11.66, 12.86)]]




0.28225806451612906


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 6.08].wav', 'new_audio[6.08, 8.24].wav', 'new_audio[8.24, 9.2].wav'], [('0', 0, 6.08), ('1', 6.08, 8.24), ('0', 8.24, 9.2)]]




0.2862903225806452


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.2903225806451613


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.06].wav', 'new_audio[2.06, 4.7].wav', 'new_audio[4.7, 6.38].wav'], [('0', 0, 2.06), ('1', 2.06, 4.7), ('0', 4.7, 6.38)]]




0.29435483870967744


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.48].wav', 'new_audio[2.48, 3.44].wav', 'new_audio[3.44, 5.6].wav', 'new_audio[5.6, 6.38].wav', 'new_audio[6.38, 7.88].wav', 'new_audio[7.88, 8.48].wav', 'new_audio[8.48, 9.98].wav', 'new_audio[9.98, 12.38].wav', 'new_audio[12.38, 13.28].wav'], [('4', 0, 2.48), ('8', 2.48, 3.44), ('1', 3.44, 5.6), ('5', 5.6, 6.38), ('2', 6.38, 7.88), ('3', 7.88, 8.48), ('7', 8.48, 9.98), ('0', 9.98, 12.38), ('6', 12.38, 13.28)]]




0.29838709677419356


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.66].wav', 'new_audio[2.66, 3.92].wav', 'new_audio[3.92, 5.12].wav', 'new_audio[5.12, 7.4].wav', 'new_audio[7.4, 9.38].wav'], [('2', 0, 2.66), ('0', 2.66, 3.92), ('4', 3.92, 5.12), ('1', 5.12, 7.4), ('3', 7.4, 9.38)]]




0.3024193548387097


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.98].wav', 'new_audio[0.98, 2.24].wav'], [('1', 0, 0.98), ('0', 0.98, 2.24)]]




0.3064516129032258


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.03 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.36].wav', 'new_audio[2.36, 4.4].wav', 'new_audio[4.4, 6.14].wav', 'new_audio[6.14, 7.28].wav'], [('2', 0, 2.36), ('0', 2.36, 4.4), ('1', 4.4, 6.14), ('3', 6.14, 7.28)]]




0.31048387096774194


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.31451612903225806


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.48].wav', 'new_audio[2.48, 5.3].wav'], [('1', 0, 2.48), ('0', 2.48, 5.3)]]




0.3185483870967742


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.3225806451612903


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.3].wav', 'new_audio[2.3, 3.44].wav', 'new_audio[3.44, 4.7].wav'], [('0', 0, 2.3), ('1', 2.3, 3.44), ('2', 3.44, 4.7)]]




0.32661290322580644


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.33064516129032256


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.42].wav', 'new_audio[2.42, 3.8].wav', 'new_audio[3.8, 5.06].wav'], [('2', 0, 2.42), ('0', 2.42, 3.8), ('1', 3.8, 5.06)]]




0.3346774193548387


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.1].wav', 'new_audio[1.1, 1.4].wav'], [('1', 0, 1.1), ('0', 1.1, 1.4)]]




0.3387096774193548


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.24].wav', 'new_audio[2.24, 2.78].wav', 'new_audio[2.78, 3.62].wav'], [('0', 0, 2.24), ('2', 2.24, 2.78), ('1', 2.78, 3.62)]]




0.34274193548387094


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.98].wav', 'new_audio[0.98, 1.22].wav'], [('1', 0, 0.98), ('0', 0.98, 1.22)]]




0.3467741935483871


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.35080645161290325


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.52].wav', 'new_audio[1.52, 3.38].wav', 'new_audio[3.38, 3.44].wav'], [('0', 0, 1.52), ('1', 1.52, 3.38), ('0', 3.38, 3.44)]]


  return f(*args, **kwargs)


0.3548387096774194


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.1].wav', 'new_audio[1.1, 2.6].wav', 'new_audio[2.6, 4.52].wav'], [('1', 0, 1.1), ('0', 1.1, 2.6), ('2', 2.6, 4.52)]]




0.3588709677419355


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 3.14].wav', 'new_audio[3.14, 3.98].wav'], [('1', 0, 3.14), ('0', 3.14, 3.98)]]




0.3629032258064516


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.32].wav', 'new_audio[3.32, 4.22].wav', 'new_audio[4.22, 4.34].wav', 'new_audio[4.34, 4.46].wav', 'new_audio[4.46, 5.66].wav', 'new_audio[5.66, 6.68].wav', 'new_audio[6.68, 8.42].wav', 'new_audio[8.42, 12.2].wav'], [('0', 0, 3.32), ('1', 3.32, 4.22), ('0', 4.22, 4.34), ('1', 4.34, 4.46), ('0', 4.46, 5.66), ('1', 5.66, 6.68), ('0', 6.68, 8.42), ('1', 8.42, 12.2)]]




0.36693548387096775


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.72].wav', 'new_audio[2.72, 3.56].wav'], [('0', 0, 2.72), ('1', 2.72, 3.56)]]




0.3709677419354839


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.375


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.16].wav', 'new_audio[1.16, 2.36].wav'], [('1', 0, 1.16), ('0', 1.16, 2.36)]]




0.3790322580645161


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.08].wav', 'new_audio[3.08, 5.06].wav', 'new_audio[5.06, 7.94].wav'], [('2', 0, 3.08), ('0', 3.08, 5.06), ('1', 5.06, 7.94)]]




0.38306451612903225


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.6].wav', 'new_audio[2.6, 4.58].wav', 'new_audio[4.58, 6.62].wav'], [('2', 0, 2.6), ('0', 2.6, 4.58), ('1', 4.58, 6.62)]]




0.3870967741935484


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.3911290322580645


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.28].wav', 'new_audio[1.28, 2.42].wav', 'new_audio[2.42, 3.38].wav', 'new_audio[3.38, 4.76].wav', 'new_audio[4.76, 6.2].wav', 'new_audio[6.2, 6.56].wav', 'new_audio[6.56, 7.52].wav', 'new_audio[7.52, 8.84].wav', 'new_audio[8.84, 9.5].wav'], [('5', 0, 1.28), ('3', 1.28, 2.42), ('8', 2.42, 3.38), ('4', 3.38, 4.76), ('2', 4.76, 6.2), ('0', 6.2, 6.56), ('7', 6.56, 7.52), ('1', 7.52, 8.84), ('6', 8.84, 9.5)]]




0.3951612903225806


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.34].wav', 'new_audio[1.34, 1.94].wav', 'new_audio[1.94, 2.6].wav', 'new_audio[2.6, 4.34].wav', 'new_audio[4.34, 5.18].wav', 'new_audio[5.18, 6.14].wav', 'new_audio[6.14, 7.28].wav', 'new_audio[7.28, 8.42].wav', 'new_audio[8.42, 9.2].wav'], [('4', 0, 1.34), ('2', 1.34, 1.94), ('8', 1.94, 2.6), ('0', 2.6, 4.34), ('3', 4.34, 5.18), ('5', 5.18, 6.14), ('7', 6.14, 7.28), ('1', 7.28, 8.42), ('6', 8.42, 9.2)]]




0.39919354838709675


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.94].wav', 'new_audio[1.94, 3.08].wav'], [('0', 0, 1.94), ('1', 1.94, 3.08)]]




0.4032258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 1.22].wav', 'new_audio[1.22, 1.64].wav'], [('1', 0, 1.22), ('0', 1.22, 1.64)]]




0.40725806451612906


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 0.98].wav', 'new_audio[0.98, 1.04].wav', 'new_audio[1.04, 1.1].wav', 'new_audio[1.1, 1.16].wav', 'new_audio[1.16, 1.22].wav', 'new_audio[1.22, 1.28].wav', 'new_audio[1.28, 1.76].wav', 'new_audio[1.76, 1.82].wav', 'new_audio[1.82, 1.88].wav', 'new_audio[1.88, 1.94].wav', 'new_audio[1.94, 2.12].wav', 'new_audio[2.12, 2.18].wav', 'new_audio[2.18, 2.24].wav'], [('2', 0, 0.98), ('8', 0.98, 1.04), ('9', 1.04, 1.1), ('10', 1.1, 1.16), ('6', 1.16, 1.22), ('12', 1.22, 1.28), ('0', 1.28, 1.76), ('7', 1.76, 1.82), ('11', 1.82, 1.88), ('4', 1.88, 1.94), ('3', 1.94, 2.12), ('1', 2.12, 2.18), ('5', 2.18, 2.24)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)


0.4112903225806452


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.4153225806451613


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 0.98].wav', 'new_audio[0.98, 1.04].wav', 'new_audio[1.04, 1.1].wav', 'new_audio[1.1, 1.34].wav', 'new_audio[1.34, 1.4].wav', 'new_audio[1.4, 1.52].wav', 'new_audio[1.52, 1.7].wav'], [('3', 0, 0.98), ('5', 0.98, 1.04), ('6', 1.04, 1.1), ('1', 1.1, 1.34), ('4', 1.34, 1.4), ('2', 1.4, 1.52), ('0', 1.52, 1.7)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.41935483870967744


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.76].wav', 'new_audio[1.76, 2.9].wav'], [('0', 0, 1.76), ('1', 1.76, 2.9)]]




0.42338709677419356


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 4.34].wav', 'new_audio[4.34, 5.66].wav', 'new_audio[5.66, 6.62].wav'], [('0', 0, 4.34), ('2', 4.34, 5.66), ('1', 5.66, 6.62)]]




0.4274193548387097


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.4314516129032258


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.06].wav', 'new_audio[2.06, 2.96].wav', 'new_audio[2.96, 3.5].wav', 'new_audio[3.5, 3.86].wav'], [('3', 0, 2.06), ('1', 2.06, 2.96), ('2', 2.96, 3.5), ('0', 3.5, 3.86)]]




0.43548387096774194


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.43951612903225806


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.4435483870967742


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 1.1].wav', 'new_audio[1.1, 1.28].wav'], [('1', 0, 1.1), ('0', 1.1, 1.28)]]




0.4475806451612903


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.76].wav', 'new_audio[1.76, 2.78].wav', 'new_audio[2.78, 4.4].wav', 'new_audio[4.4, 5.18].wav'], [('2', 0, 1.76), ('3', 1.76, 2.78), ('1', 2.78, 4.4), ('0', 4.4, 5.18)]]




0.45161290322580644


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.
Erreur
0.45564516129032256


  frames = librosa.feature.melspectrogram(
  refined_affinity /= np.expand_dims(row_max, axis=1)
  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.04].wav', 'new_audio[1.04, 1.28].wav'], [('1', 0, 1.04), ('0', 1.04, 1.28)]]




0.4596774193548387


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.58].wav', 'new_audio[1.58, 1.94].wav'], [('0', 0, 1.58), ('1', 1.58, 1.94)]]




0.4637096774193548


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.46774193548387094


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.36 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 2.84].wav', 'new_audio[2.84, 3.8].wav', 'new_audio[3.8, 3.98].wav', 'new_audio[3.98, 5.36].wav', 'new_audio[5.36, 6.38].wav'], [('4', 0, 1.88), ('3', 1.88, 2.84), ('2', 2.84, 3.8), ('3', 3.8, 3.98), ('1', 3.98, 5.36), ('0', 5.36, 6.38)]]




0.4717741935483871


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 3.2].wav', 'new_audio[3.2, 4.58].wav', 'new_audio[4.58, 5.78].wav', 'new_audio[5.78, 7.1].wav', 'new_audio[7.1, 8.6].wav', 'new_audio[8.6, 9.98].wav', 'new_audio[9.98, 11.12].wav', 'new_audio[11.12, 12.38].wav', 'new_audio[12.38, 13.82].wav', 'new_audio[13.82, 15.26].wav', 'new_audio[15.26, 16.7].wav', 'new_audio[16.7, 18.32].wav', 'new_audio[18.32, 19.28].wav', 'new_audio[19.28, 19.4].wav', 'new_audio[19.4, 20.66].wav'], [('7', 0, 1.82), ('9', 1.82, 3.2), ('3', 3.2, 4.58), ('14', 4.58, 5.78), ('6', 5.78, 7.1), ('11', 7.1, 8.6), ('0', 8.6, 9.98), ('4', 9.98, 11.12), ('1', 11.12, 12.38), ('13', 12.38, 13.82), ('12', 13.82, 15.26), ('2', 15.26, 16.7), ('5', 16.7, 18.32), ('8', 18.32, 19.28), ('4', 19.28, 19.4), ('10', 19.4, 20.66)]]






0.47580645161290325


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 6.02].wav', 'new_audio[6.02, 6.98].wav', 'new_audio[6.98, 8.78].wav', 'new_audio[8.78, 8.84].wav', 'new_audio[8.84, 9.02].wav', 'new_audio[9.02, 10.88].wav', 'new_audio[10.88, 12.44].wav', 'new_audio[12.44, 14.3].wav', 'new_audio[14.3, 16.76].wav', 'new_audio[16.76, 17.84].wav', 'new_audio[17.84, 19.82].wav', 'new_audio[19.82, 19.82].wav'], [('0', 0, 6.02), ('1', 6.02, 6.98), ('4', 6.98, 8.78), ('1', 8.78, 8.84), ('4', 8.84, 9.02), ('1', 9.02, 10.88), ('6', 10.88, 12.44), ('2', 12.44, 14.3), ('3', 14.3, 16.76), ('1', 16.76, 17.84), ('5', 17.84, 19.82), ('2', 19.82, 19.82)]]


  return f(*args, **kwargs)


Erreur
0.4798387096774194


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.24].wav', 'new_audio[2.24, 3.92].wav', 'new_audio[3.92, 4.94].wav', 'new_audio[4.94, 5.54].wav', 'new_audio[5.54, 6.86].wav'], [('0', 0, 2.24), ('1', 2.24, 3.92), ('3', 3.92, 4.94), ('2', 4.94, 5.54), ('4', 5.54, 6.86)]]




0.4838709677419355


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.22].wav', 'new_audio[1.22, 1.82].wav', 'new_audio[1.82, 3.08].wav', 'new_audio[3.08, 3.62].wav', 'new_audio[3.62, 4.7].wav', 'new_audio[4.7, 6.38].wav', 'new_audio[6.38, 7.22].wav', 'new_audio[7.22, 8.96].wav', 'new_audio[8.96, 9.56].wav', 'new_audio[9.56, 10.82].wav', 'new_audio[10.82, 11.24].wav', 'new_audio[11.24, 12.68].wav', 'new_audio[12.68, 12.74].wav', 'new_audio[12.74, 13.16].wav', 'new_audio[13.16, 13.7].wav', 'new_audio[13.7, 15.08].wav', 'new_audio[15.08, 15.26].wav', 'new_audio[15.26, 15.44].wav', 'new_audio[15.44, 15.68].wav', 'new_audio[15.68, 16.28].wav', 'new_audio[16.28, 16.34].wav', 'new_audio[16.34, 16.4].wav', 'new_audio[16.4, 16.94].wav', 'new_audio[16.94, 18.32].wav', 'new_audio[18.32, 18.98].wav', 'new_audio[18.98, 20.18].wav', 'new_audio[20.18, 21.08].wav', 'new_audio[21.08, 21.8].wav'], [('19', 0, 1.22), ('7', 1.22, 1.82), ('8', 1.82, 3.08), ('18', 3.08, 3.62), ('11', 3.62, 4.7), ('13', 4.7, 6.38), ('6', 6.38, 7.22), ('5', 7.22, 8.96), ('16',

  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.4879032258064516


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.16].wav', 'new_audio[1.16, 1.58].wav'], [('1', 0, 1.16), ('0', 1.16, 1.58)]]




0.49193548387096775


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.04].wav', 'new_audio[1.04, 1.22].wav'], [('1', 0, 1.04), ('0', 1.04, 1.22)]]




0.4959677419354839


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.5


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.86].wav', 'new_audio[0.86, 0.86].wav'], [('0', 0, 0.86), ('1', 0.86, 0.86)]]
Erreur
0.5040322580645161


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.04].wav', 'new_audio[1.04, 1.1].wav', 'new_audio[1.1, 1.22].wav', 'new_audio[1.22, 1.4].wav', 'new_audio[1.4, 1.58].wav', 'new_audio[1.58, 1.7].wav', 'new_audio[1.7, 1.82].wav', 'new_audio[1.82, 1.88].wav', 'new_audio[1.88, 1.94].wav', 'new_audio[1.94, 2.42].wav', 'new_audio[2.42, 2.48].wav', 'new_audio[2.48, 2.54].wav', 'new_audio[2.54, 2.66].wav', 'new_audio[2.66, 2.72].wav', 'new_audio[2.72, 2.78].wav', 'new_audio[2.78, 2.84].wav', 'new_audio[2.84, 2.96].wav', 'new_audio[2.96, 3.08].wav', 'new_audio[3.08, 3.08].wav'], [('0', 0, 1.04), ('11', 1.04, 1.1), ('4', 1.1, 1.22), ('6', 1.22, 1.4), ('7', 1.4, 1.58), ('12', 1.58, 1.7), ('8', 1.7, 1.82), ('15', 1.82, 1.88), ('14', 1.88, 1.94), ('1', 1.94, 2.42), ('18', 2.42, 2.48), ('17', 2.48, 2.54), ('9', 2.54, 2.66), ('16', 2.66, 2.72), ('13', 2.72, 2.78), ('10', 2.78, 2.84), ('3', 2.84, 2.96), ('2', 2.96, 3.08), ('5', 3.08, 3.08)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)


Erreur
0.5080645161290323


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.18].wav', 'new_audio[2.18, 3.44].wav'], [('1', 0, 2.18), ('0', 2.18, 3.44)]]




0.5120967741935484


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 4.64].wav', 'new_audio[4.64, 5.78].wav'], [('0', 0, 4.64), ('1', 4.64, 5.78)]]




0.5161290322580645


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 3.5].wav', 'new_audio[3.5, 5.0].wav'], [('2', 0, 1.88), ('0', 1.88, 3.5), ('1', 3.5, 5.0)]]




0.5201612903225806


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.98].wav', 'new_audio[0.98, 1.04].wav', 'new_audio[1.04, 1.1].wav', 'new_audio[1.1, 1.16].wav', 'new_audio[1.16, 1.52].wav', 'new_audio[1.52, 1.58].wav', 'new_audio[1.58, 1.64].wav', 'new_audio[1.64, 1.7].wav', 'new_audio[1.7, 1.82].wav'], [('2', 0, 0.98), ('5', 0.98, 1.04), ('8', 1.04, 1.1), ('7', 1.1, 1.16), ('1', 1.16, 1.52), ('0', 1.52, 1.58), ('6', 1.58, 1.64), ('4', 1.64, 1.7), ('3', 1.7, 1.82)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.5241935483870968


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.86].wav', 'new_audio[0.86, 1.22].wav', 'new_audio[1.22, 1.34].wav', 'new_audio[1.34, 1.4].wav', 'new_audio[1.4, 1.46].wav', 'new_audio[1.46, 1.7].wav', 'new_audio[1.7, 1.76].wav', 'new_audio[1.76, 1.82].wav', 'new_audio[1.82, 1.88].wav', 'new_audio[1.88, 2.3].wav', 'new_audio[2.3, 2.36].wav'], [('5', 0, 0.86), ('3', 0.86, 1.22), ('2', 1.22, 1.34), ('8', 1.34, 1.4), ('9', 1.4, 1.46), ('0', 1.46, 1.7), ('7', 1.7, 1.76), ('10', 1.76, 1.82), ('6', 1.82, 1.88), ('1', 1.88, 2.3), ('4', 2.3, 2.36)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.5282258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.86].wav', 'new_audio[0.86, 1.94].wav'], [('1', 0, 0.86), ('0', 0.86, 1.94)]]




0.532258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.66].wav', 'new_audio[2.66, 3.92].wav', 'new_audio[3.92, 5.78].wav'], [('2', 0, 2.66), ('0', 2.66, 3.92), ('1', 3.92, 5.78)]]




0.5362903225806451


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 3.56].wav'], [('0', 0, 1.82), ('1', 1.82, 3.56)]]




0.5403225806451613


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 6.38].wav', 'new_audio[6.38, 9.14].wav', 'new_audio[9.14, 10.4].wav'], [('0', 0, 6.38), ('1', 6.38, 9.14), ('0', 9.14, 10.4)]]




0.5443548387096774


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 0.98].wav', 'new_audio[0.98, 1.16].wav', 'new_audio[1.16, 1.22].wav', 'new_audio[1.22, 1.76].wav', 'new_audio[1.76, 1.82].wav', 'new_audio[1.82, 1.88].wav', 'new_audio[1.88, 2.12].wav', 'new_audio[2.12, 2.18].wav', 'new_audio[2.18, 2.3].wav', 'new_audio[2.3, 2.36].wav', 'new_audio[2.36, 2.42].wav', 'new_audio[2.42, 2.66].wav', 'new_audio[2.66, 2.84].wav', 'new_audio[2.84, 2.96].wav', 'new_audio[2.96, 3.02].wav', 'new_audio[3.02, 3.14].wav', 'new_audio[3.14, 3.26].wav', 'new_audio[3.26, 3.32].wav', 'new_audio[3.32, 3.38].wav', 'new_audio[3.38, 3.44].wav', 'new_audio[3.44, 3.5].wav', 'new_audio[3.5, 3.56].wav', 'new_audio[3.56, 3.62].wav', 'new_audio[3.62, 3.68].wav', 'new_audio[3.68, 3.74].wav', 'new_audio[3.74, 3.8].wav', 'new_audio[3.8, 3.86].wav', 'new_audio[3.86, 4.16].wav'], [('3', 0, 0.98), ('0', 0.98, 1.16), ('3', 1.16, 1.22), ('0', 1.22, 1.76), ('12', 1.76, 1.82), ('17', 1.82, 1.88), ('18', 1.88, 2.12), ('22', 2.12, 2.18), ('15', 2.18, 2.3), ('23', 2.3, 2.36), ('

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.5483870967741935


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.06].wav', 'new_audio[2.06, 3.8].wav'], [('1', 0, 2.06), ('0', 2.06, 3.8)]]




0.5524193548387096


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.86].wav', 'new_audio[0.86, 0.98].wav', 'new_audio[0.98, 1.1].wav', 'new_audio[1.1, 1.16].wav', 'new_audio[1.16, 1.22].wav', 'new_audio[1.22, 1.94].wav', 'new_audio[1.94, 2.0].wav', 'new_audio[2.0, 2.12].wav', 'new_audio[2.12, 2.12].wav'], [('3', 0, 0.86), ('2', 0.86, 0.98), ('3', 0.98, 1.1), ('2', 1.1, 1.16), ('5', 1.16, 1.22), ('0', 1.22, 1.94), ('6', 1.94, 2.0), ('4', 2.0, 2.12), ('1', 2.12, 2.12)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Erreur
0.5564516129032258


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.9].wav', 'new_audio[2.9, 4.04].wav', 'new_audio[4.04, 5.18].wav', 'new_audio[5.18, 6.02].wav', 'new_audio[6.02, 7.46].wav', 'new_audio[7.46, 8.78].wav'], [('2', 0, 2.9), ('5', 2.9, 4.04), ('0', 4.04, 5.18), ('3', 5.18, 6.02), ('4', 6.02, 7.46), ('1', 7.46, 8.78)]]




0.5604838709677419


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.84].wav', 'new_audio[2.84, 5.0].wav', 'new_audio[5.0, 7.76].wav', 'new_audio[7.76, 10.7].wav', 'new_audio[10.7, 13.1].wav', 'new_audio[13.1, 13.22].wav', 'new_audio[13.22, 15.14].wav', 'new_audio[15.14, 15.56].wav'], [('3', 0, 2.84), ('1', 2.84, 5.0), ('2', 5.0, 7.76), ('1', 7.76, 10.7), ('0', 10.7, 13.1), ('2', 13.1, 13.22), ('4', 13.22, 15.14), ('0', 15.14, 15.56)]]




0.5645161290322581


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 4.82].wav', 'new_audio[4.82, 9.26].wav'], [('0', 0, 4.82), ('1', 4.82, 9.26)]]




0.5685483870967742


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.0].wav', 'new_audio[2.0, 3.32].wav'], [('0', 0, 2.0), ('1', 2.0, 3.32)]]




0.5725806451612904


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 0.92].wav', 'new_audio[0.92, 1.04].wav', 'new_audio[1.04, 1.16].wav', 'new_audio[1.16, 1.34].wav', 'new_audio[1.34, 1.58].wav', 'new_audio[1.58, 1.7].wav', 'new_audio[1.7, 1.94].wav', 'new_audio[1.94, 2.0].wav', 'new_audio[2.0, 2.06].wav', 'new_audio[2.06, 2.12].wav', 'new_audio[2.12, 2.18].wav', 'new_audio[2.18, 2.24].wav', 'new_audio[2.24, 2.3].wav', 'new_audio[2.3, 2.36].wav', 'new_audio[2.36, 2.66].wav', 'new_audio[2.66, 2.78].wav', 'new_audio[2.78, 2.84].wav', 'new_audio[2.84, 2.96].wav'], [('0', 0, 0.92), ('15', 0.92, 1.04), ('4', 1.04, 1.16), ('7', 1.16, 1.34), ('12', 1.34, 1.58), ('8', 1.58, 1.7), ('6', 1.7, 1.94), ('10', 1.94, 2.0), ('14', 2.0, 2.06), ('9', 2.06, 2.12), ('11', 2.12, 2.18), ('13', 2.18, 2.24), ('1', 2.24, 2.3), ('5', 2.3, 2.36), ('3', 2.36, 2.66), ('2', 2.66, 2.78), ('0', 2.78, 2.84), ('2', 2.84, 2.96)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)


0.5766129032258065


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 0.92].wav', 'new_audio[0.92, 0.98].wav'], [('0', 0, 0.92), ('1', 0.92, 0.98)]]


  return f(*args, **kwargs)


0.5806451612903226


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.5846774193548387


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.7].wav', 'new_audio[1.7, 3.86].wav'], [('1', 0, 1.7), ('0', 1.7, 3.86)]]




0.5887096774193549


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.48].wav', 'new_audio[2.48, 3.44].wav', 'new_audio[3.44, 4.76].wav', 'new_audio[4.76, 4.88].wav'], [('1', 0, 2.12), ('3', 2.12, 2.48), ('2', 2.48, 3.44), ('0', 3.44, 4.76), ('3', 4.76, 4.88)]]




0.592741935483871


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.54].wav', 'new_audio[2.54, 4.4].wav', 'new_audio[4.4, 6.44].wav', 'new_audio[6.44, 6.56].wav', 'new_audio[6.56, 8.3].wav'], [('1', 0, 2.54), ('0', 2.54, 4.4), ('3', 4.4, 6.44), ('0', 6.44, 6.56), ('2', 6.56, 8.3)]]




0.5967741935483871


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.28].wav', 'new_audio[1.28, 3.02].wav', 'new_audio[3.02, 4.58].wav', 'new_audio[4.58, 5.96].wav', 'new_audio[5.96, 8.0].wav', 'new_audio[8.0, 9.38].wav', 'new_audio[9.38, 10.22].wav'], [('3', 0, 1.28), ('5', 1.28, 3.02), ('0', 3.02, 4.58), ('4', 4.58, 5.96), ('1', 5.96, 8.0), ('6', 8.0, 9.38), ('2', 9.38, 10.22)]]




0.6008064516129032


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6048387096774194


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6088709677419355


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(
  refined_affinity /= np.expand_dims(row_max, axis=1)


Loaded the voice encoder model on cpu in 0.01 seconds.
Erreur
0.6129032258064516


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.42].wav', 'new_audio[2.42, 6.2].wav'], [('1', 0, 2.42), ('0', 2.42, 6.2)]]




0.6169354838709677


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6209677419354839


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.625


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6290322580645161


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.04].wav', 'new_audio[1.04, 1.28].wav', 'new_audio[1.28, 1.46].wav', 'new_audio[1.46, 1.76].wav', 'new_audio[1.76, 1.82].wav', 'new_audio[1.82, 1.88].wav', 'new_audio[1.88, 1.94].wav', 'new_audio[1.94, 2.0].wav', 'new_audio[2.0, 2.06].wav', 'new_audio[2.06, 2.12].wav', 'new_audio[2.12, 2.18].wav', 'new_audio[2.18, 2.3].wav', 'new_audio[2.3, 2.48].wav', 'new_audio[2.48, 2.6].wav', 'new_audio[2.6, 2.66].wav', 'new_audio[2.66, 2.72].wav', 'new_audio[2.72, 2.78].wav', 'new_audio[2.78, 2.84].wav', 'new_audio[2.84, 2.9].wav', 'new_audio[2.9, 2.96].wav', 'new_audio[2.96, 3.02].wav', 'new_audio[3.02, 3.86].wav'], [('8', 0, 1.04), ('3', 1.04, 1.28), ('11', 1.28, 1.46), ('1', 1.46, 1.76), ('11', 1.76, 1.82), ('13', 1.82, 1.88), ('10', 1.88, 1.94), ('14', 1.94, 2.0), ('1', 2.0, 2.06), ('15', 2.06, 2.12), ('4', 2.12, 2.18), ('2', 2.18, 2.3), ('7', 2.3, 2.48), ('2', 2.48, 2.6), ('6', 2.6, 2.66), ('17', 2.66, 2.72), ('16', 2.72, 2.78), ('5', 2.78, 2.84), ('9', 2.84, 2.9), ('12', 2.9

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.6330645161290323


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6370967741935484


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.18].wav', 'new_audio[2.18, 3.74].wav', 'new_audio[3.74, 5.42].wav', 'new_audio[5.42, 6.14].wav', 'new_audio[6.14, 7.76].wav', 'new_audio[7.76, 9.14].wav', 'new_audio[9.14, 10.1].wav'], [('5', 0, 2.18), ('2', 2.18, 3.74), ('3', 3.74, 5.42), ('5', 5.42, 6.14), ('1', 6.14, 7.76), ('0', 7.76, 9.14), ('4', 9.14, 10.1)]]




0.6411290322580645


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 3.92].wav', 'new_audio[3.92, 5.36].wav', 'new_audio[5.36, 7.46].wav'], [('3', 0, 1.88), ('0', 1.88, 3.92), ('2', 3.92, 5.36), ('1', 5.36, 7.46)]]




0.6451612903225806


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.64].wav', 'new_audio[1.64, 2.18].wav', 'new_audio[2.18, 2.42].wav', 'new_audio[2.42, 4.28].wav', 'new_audio[4.28, 5.66].wav', 'new_audio[5.66, 6.92].wav', 'new_audio[6.92, 8.96].wav'], [('2', 0, 1.64), ('3', 1.64, 2.18), ('0', 2.18, 2.42), ('2', 2.42, 4.28), ('3', 4.28, 5.66), ('0', 5.66, 6.92), ('1', 6.92, 8.96)]]




0.6491935483870968


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 0.86].wav', 'new_audio[0.86, 0.92].wav', 'new_audio[0.92, 1.1].wav', 'new_audio[1.1, 1.28].wav', 'new_audio[1.28, 2.0].wav', 'new_audio[2.0, 2.06].wav', 'new_audio[2.06, 2.12].wav', 'new_audio[2.12, 2.18].wav', 'new_audio[2.18, 2.24].wav', 'new_audio[2.24, 2.3].wav', 'new_audio[2.3, 2.36].wav', 'new_audio[2.36, 2.42].wav', 'new_audio[2.42, 2.48].wav', 'new_audio[2.48, 2.6].wav', 'new_audio[2.6, 2.72].wav', 'new_audio[2.72, 2.84].wav', 'new_audio[2.84, 2.9].wav', 'new_audio[2.9, 2.96].wav'], [('16', 0, 0.86), ('7', 0.86, 0.92), ('12', 0.92, 1.1), ('9', 1.1, 1.28), ('2', 1.28, 2.0), ('11', 2.0, 2.06), ('14', 2.06, 2.12), ('15', 2.12, 2.18), ('6', 2.18, 2.24), ('8', 2.24, 2.3), ('13', 2.3, 2.36), ('10', 2.36, 2.42), ('5', 2.42, 2.48), ('0', 2.48, 2.6), ('4', 2.6, 2.72), ('1', 2.72, 2.84), ('17', 2.84, 2.9), ('3', 2.9, 2.96)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.6532258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.94].wav', 'new_audio[1.94, 2.42].wav'], [('0', 0, 1.94), ('1', 1.94, 2.42)]]




0.657258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.18].wav', 'new_audio[2.18, 3.86].wav', 'new_audio[3.86, 5.84].wav', 'new_audio[5.84, 6.8].wav', 'new_audio[6.8, 8.06].wav', 'new_audio[8.06, 9.26].wav', 'new_audio[9.26, 10.64].wav', 'new_audio[10.64, 12.38].wav', 'new_audio[12.38, 14.36].wav'], [('1', 0, 2.18), ('8', 2.18, 3.86), ('2', 3.86, 5.84), ('3', 5.84, 6.8), ('6', 6.8, 8.06), ('5', 8.06, 9.26), ('7', 9.26, 10.64), ('0', 10.64, 12.38), ('4', 12.38, 14.36)]]




0.6612903225806451


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.78].wav', 'new_audio[2.78, 4.52].wav', 'new_audio[4.52, 6.62].wav', 'new_audio[6.62, 8.48].wav', 'new_audio[8.48, 11.0].wav'], [('0', 0, 2.78), ('4', 2.78, 4.52), ('2', 4.52, 6.62), ('1', 6.62, 8.48), ('3', 8.48, 11.0)]]




0.6653225806451613


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.6693548387096774


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.03 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.04].wav', 'new_audio[1.04, 1.28].wav'], [('1', 0, 1.04), ('0', 1.04, 1.28)]]




0.6733870967741935


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 10.82].wav', 'new_audio[10.82, 11.0].wav', 'new_audio[11.0, 11.36].wav', 'new_audio[11.36, 16.34].wav', 'new_audio[16.34, 16.58].wav', 'new_audio[16.58, 16.88].wav', 'new_audio[16.88, 18.44].wav', 'new_audio[18.44, 18.62].wav', 'new_audio[18.62, 18.8].wav', 'new_audio[18.8, 22.04].wav'], [('0', 0, 10.82), ('1', 10.82, 11.0), ('0', 11.0, 11.36), ('1', 11.36, 16.34), ('0', 16.34, 16.58), ('1', 16.58, 16.88), ('0', 16.88, 18.44), ('1', 18.44, 18.62), ('0', 18.62, 18.8), ('1', 18.8, 22.04)]]




0.6774193548387096


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.84].wav', 'new_audio[2.84, 4.28].wav', 'new_audio[4.28, 4.46].wav', 'new_audio[4.46, 6.2].wav', 'new_audio[6.2, 6.44].wav', 'new_audio[6.44, 6.98].wav', 'new_audio[6.98, 8.66].wav', 'new_audio[8.66, 9.74].wav', 'new_audio[9.74, 11.24].wav', 'new_audio[11.24, 11.72].wav', 'new_audio[11.72, 13.28].wav', 'new_audio[13.28, 13.76].wav'], [('3', 0, 2.12), ('10', 2.12, 2.84), ('6', 2.84, 4.28), ('1', 4.28, 4.46), ('5', 4.46, 6.2), ('0', 6.2, 6.44), ('8', 6.44, 6.98), ('0', 6.98, 8.66), ('1', 8.66, 9.74), ('9', 9.74, 11.24), ('7', 11.24, 11.72), ('2', 11.72, 13.28), ('4', 13.28, 13.76)]]




0.6814516129032258


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6854838709677419


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.42].wav', 'new_audio[2.42, 3.62].wav', 'new_audio[3.62, 5.42].wav', 'new_audio[5.42, 6.68].wav'], [('2', 0, 2.42), ('0', 2.42, 3.62), ('1', 3.62, 5.42), ('3', 5.42, 6.68)]]




0.6895161290322581


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.6935483870967742


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.18].wav', 'new_audio[2.18, 3.5].wav', 'new_audio[3.5, 5.12].wav', 'new_audio[5.12, 6.68].wav', 'new_audio[6.68, 8.12].wav', 'new_audio[8.12, 9.14].wav'], [('3', 0, 2.18), ('0', 2.18, 3.5), ('4', 3.5, 5.12), ('2', 5.12, 6.68), ('1', 6.68, 8.12), ('5', 8.12, 9.14)]]




0.6975806451612904


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


 0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.66].wav', 'new_audio[2.66, 4.34].wav', 'new_audio[4.34, 5.96].wav', 'new_audio[5.96, 7.88].wav', 'new_audio[7.88, 9.74].wav', 'new_audio[9.74, 11.72].wav', 'new_audio[11.72, 13.58].wav', 'new_audio[13.58, 14.72].wav'], [('0', 0, 2.66), ('7', 2.66, 4.34), ('2', 4.34, 5.96), ('1', 5.96, 7.88), ('6', 7.88, 9.74), ('5', 9.74, 11.72), ('3', 11.72, 13.58), ('4', 13.58, 14.72)]]




0.7016129032258065


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 10.34].wav', 'new_audio[10.34, 11.54].wav', 'new_audio[11.54, 11.9].wav', 'new_audio[11.9, 21.74].wav', 'new_audio[21.74, 21.8].wav', 'new_audio[21.8, 21.86].wav', 'new_audio[21.86, 21.98].wav', 'new_audio[21.98, 28.28].wav'], [('1', 0, 10.34), ('0', 10.34, 11.54), ('1', 11.54, 11.9), ('0', 11.9, 21.74), ('1', 21.74, 21.8), ('0', 21.8, 21.86), ('1', 21.86, 21.98), ('0', 21.98, 28.28)]]


  return f(*args, **kwargs)
  return f(*args, **kwargs)


0.7056451612903226


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 2.06].wav', 'new_audio[2.06, 3.26].wav'], [('0', 0, 1.82), ('2', 1.82, 2.06), ('1', 2.06, 3.26)]]




0.7096774193548387


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.7137096774193549


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.717741935483871


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.7217741935483871


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 1.4].wav', 'new_audio[1.4, 1.94].wav'], [('0', 0, 1.4), ('1', 1.4, 1.94)]]




0.7258064516129032


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.78].wav', 'new_audio[2.78, 8.66].wav', 'new_audio[8.66, 12.14].wav', 'new_audio[12.14, 17.48].wav', 'new_audio[17.48, 18.8].wav', 'new_audio[18.8, 20.78].wav', 'new_audio[20.78, 25.82].wav', 'new_audio[25.82, 25.94].wav'], [('0', 0, 2.78), ('1', 2.78, 8.66), ('0', 8.66, 12.14), ('1', 12.14, 17.48), ('0', 17.48, 18.8), ('1', 18.8, 20.78), ('0', 20.78, 25.82), ('1', 25.82, 25.94)]]




0.7298387096774194


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.72].wav', 'new_audio[2.72, 4.52].wav', 'new_audio[4.52, 6.56].wav', 'new_audio[6.56, 7.64].wav', 'new_audio[7.64, 7.94].wav', 'new_audio[7.94, 8.12].wav', 'new_audio[8.12, 8.48].wav', 'new_audio[8.48, 11.06].wav', 'new_audio[11.06, 12.92].wav', 'new_audio[12.92, 14.48].wav', 'new_audio[14.48, 16.76].wav', 'new_audio[16.76, 16.94].wav'], [('4', 0, 2.72), ('0', 2.72, 4.52), ('1', 4.52, 6.56), ('2', 6.56, 7.64), ('6', 7.64, 7.94), ('2', 7.94, 8.12), ('6', 8.12, 8.48), ('2', 8.48, 11.06), ('6', 11.06, 12.92), ('3', 12.92, 14.48), ('5', 14.48, 16.76), ('6', 16.76, 16.94)]]




0.7338709677419355


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.7379032258064516


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.52].wav', 'new_audio[1.52, 1.82].wav'], [('0', 0, 1.52), ('1', 1.52, 1.82)]]




0.7419354838709677


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.48].wav', 'new_audio[2.48, 4.28].wav', 'new_audio[4.28, 6.2].wav'], [('2', 0, 2.48), ('0', 2.48, 4.28), ('1', 4.28, 6.2)]]




0.7459677419354839


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.76].wav', 'new_audio[1.76, 2.54].wav', 'new_audio[2.54, 3.5].wav', 'new_audio[3.5, 4.16].wav', 'new_audio[4.16, 4.94].wav', 'new_audio[4.94, 5.66].wav', 'new_audio[5.66, 7.04].wav', 'new_audio[7.04, 7.76].wav', 'new_audio[7.76, 8.36].wav'], [('3', 0, 1.76), ('8', 1.76, 2.54), ('4', 2.54, 3.5), ('0', 3.5, 4.16), ('6', 4.16, 4.94), ('5', 4.94, 5.66), ('2', 5.66, 7.04), ('1', 7.04, 7.76), ('7', 7.76, 8.36)]]




0.75


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.7540322580645161


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 2.36].wav', 'new_audio[2.36, 5.36].wav', 'new_audio[5.36, 6.56].wav', 'new_audio[6.56, 7.46].wav'], [('3', 0, 1.82), ('4', 1.82, 2.36), ('1', 2.36, 5.36), ('0', 5.36, 6.56), ('2', 6.56, 7.46)]]




0.7580645161290323


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 3.56].wav', 'new_audio[3.56, 4.52].wav'], [('2', 0, 2.12), ('0', 2.12, 3.56), ('1', 3.56, 4.52)]]




0.7620967741935484


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.28].wav', 'new_audio[1.28, 1.82].wav'], [('1', 0, 1.28), ('0', 1.28, 1.82)]]




0.7661290322580645


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.28].wav', 'new_audio[1.28, 3.08].wav'], [('0', 0, 1.28), ('1', 1.28, 3.08)]]




0.7701612903225806


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.7741935483870968


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 3.32].wav', 'new_audio[3.32, 3.38].wav', 'new_audio[3.38, 5.24].wav'], [('1', 0, 3.02), ('0', 3.02, 3.32), ('1', 3.32, 3.38), ('0', 3.38, 5.24)]]


  return f(*args, **kwargs)


0.7782258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
 0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.34].wav', 'new_audio[1.34, 1.76].wav'], [('0', 0, 1.34), ('1', 1.34, 1.76)]]




0.782258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(
  refined_affinity /= np.expand_dims(row_max, axis=1)


Loaded the voice encoder model on cpu in 0.01 seconds.
Erreur
0.7862903225806451


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 3.5].wav', 'new_audio[3.5, 4.88].wav', 'new_audio[4.88, 5.06].wav', 'new_audio[5.06, 5.12].wav', 'new_audio[5.12, 6.68].wav'], [('3', 0, 2.0), ('0', 2.0, 3.5), ('2', 3.5, 4.88), ('3', 4.88, 5.06), ('2', 5.06, 5.12), ('1', 5.12, 6.68)]]


  return f(*args, **kwargs)


0.7903225806451613


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.34].wav', 'new_audio[1.34, 1.88].wav'], [('1', 0, 1.34), ('0', 1.34, 1.88)]]




0.7943548387096774


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.28].wav', 'new_audio[1.28, 3.14].wav', 'new_audio[3.14, 4.04].wav', 'new_audio[4.04, 5.66].wav', 'new_audio[5.66, 6.68].wav', 'new_audio[6.68, 7.28].wav', 'new_audio[7.28, 8.6].wav'], [('5', 0, 1.28), ('1', 1.28, 3.14), ('4', 3.14, 4.04), ('2', 4.04, 5.66), ('3', 5.66, 6.68), ('5', 6.68, 7.28), ('0', 7.28, 8.6)]]




0.7983870967741935


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8024193548387096


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8064516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.8104838709677419


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8145161290322581


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8185483870967742


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.8225806451612904


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.84].wav', 'new_audio[2.84, 3.38].wav', 'new_audio[3.38, 3.56].wav', 'new_audio[3.56, 9.2].wav'], [('1', 0, 2.84), ('0', 2.84, 3.38), ('1', 3.38, 3.56), ('0', 3.56, 9.2)]]




0.8266129032258065


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.9].wav', 'new_audio[2.9, 3.14].wav', 'new_audio[3.14, 3.32].wav', 'new_audio[3.32, 5.66].wav', 'new_audio[5.66, 6.68].wav', 'new_audio[6.68, 7.34].wav', 'new_audio[7.34, 8.36].wav'], [('0', 0, 2.9), ('3', 2.9, 3.14), ('0', 3.14, 3.32), ('3', 3.32, 5.66), ('2', 5.66, 6.68), ('4', 6.68, 7.34), ('1', 7.34, 8.36)]]




0.8306451612903226


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8346774193548387


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8387096774193549


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.842741935483871


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8467741935483871


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8508064516129032


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8548387096774194


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8588709677419355


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.08].wav', 'new_audio[3.08, 11.54].wav', 'new_audio[11.54, 14.24].wav'], [('1', 0, 3.08), ('0', 3.08, 11.54), ('1', 11.54, 14.24)]]




0.8629032258064516


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8669354838709677


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8709677419354839


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.875


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.8790322580645161


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8830645161290323


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8870967741935484


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8911290322580645


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.8951612903225806


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.8991935483870968


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.9032258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.72].wav', 'new_audio[2.72, 5.06].wav', 'new_audio[5.06, 6.8].wav', 'new_audio[6.8, 8.18].wav', 'new_audio[8.18, 8.96].wav', 'new_audio[8.96, 10.1].wav', 'new_audio[10.1, 11.12].wav'], [('1', 0, 2.72), ('2', 2.72, 5.06), ('6', 5.06, 6.8), ('3', 6.8, 8.18), ('5', 8.18, 8.96), ('4', 8.96, 10.1), ('0', 10.1, 11.12)]]




0.907258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.18].wav', 'new_audio[2.18, 2.9].wav', 'new_audio[2.9, 3.5].wav', 'new_audio[3.5, 4.52].wav', 'new_audio[4.52, 5.48].wav', 'new_audio[5.48, 6.38].wav', 'new_audio[6.38, 7.76].wav', 'new_audio[7.76, 8.9].wav', 'new_audio[8.9, 11.0].wav', 'new_audio[11.0, 12.26].wav', 'new_audio[12.26, 13.94].wav', 'new_audio[13.94, 15.32].wav', 'new_audio[15.32, 17.42].wav', 'new_audio[17.42, 18.62].wav', 'new_audio[18.62, 20.06].wav', 'new_audio[20.06, 21.44].wav', 'new_audio[21.44, 22.46].wav', 'new_audio[22.46, 23.66].wav', 'new_audio[23.66, 24.62].wav', 'new_audio[24.62, 24.92].wav', 'new_audio[24.92, 25.76].wav', 'new_audio[25.76, 27.62].wav', 'new_audio[27.62, 29.66].wav', 'new_audio[29.66, 29.78].wav', 'new_audio[29.78, 30.62].wav'], [('7', 0, 2.18), ('8', 2.18, 2.9), ('0', 2.9, 3.5), ('11', 3.5, 4.52), ('8', 4.52, 5.48), ('15', 5.48, 6.38), ('5', 6.38, 7.76), ('16', 7.76, 8.9), ('10', 8.9, 11.0), ('13', 11.0, 12.26), ('6', 12.26, 13.94), ('17', 13.94, 15.32), ('4', 15.32, 17.42)





0.9112903225806451


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.9153225806451613


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.9193548387096774


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.7].wav', 'new_audio[1.7, 2.96].wav', 'new_audio[2.96, 4.34].wav', 'new_audio[4.34, 6.32].wav', 'new_audio[6.32, 8.0].wav', 'new_audio[8.0, 8.72].wav'], [('4', 0, 1.7), ('2', 1.7, 2.96), ('5', 2.96, 4.34), ('1', 4.34, 6.32), ('3', 6.32, 8.0), ('0', 8.0, 8.72)]]




0.9233870967741935


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.9274193548387096


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.7].wav', 'new_audio[1.7, 3.2].wav', 'new_audio[3.2, 4.34].wav'], [('2', 0, 1.7), ('1', 1.7, 3.2), ('0', 3.2, 4.34)]]




0.9314516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.9354838709677419


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 7.16].wav', 'new_audio[7.16, 19.76].wav', 'new_audio[19.76, 20.06].wav', 'new_audio[20.06, 20.18].wav', 'new_audio[20.18, 20.3].wav', 'new_audio[20.3, 21.02].wav', 'new_audio[21.02, 21.14].wav', 'new_audio[21.14, 23.24].wav', 'new_audio[23.24, 23.36].wav', 'new_audio[23.36, 24.56].wav'], [('1', 0, 7.16), ('0', 7.16, 19.76), ('1', 19.76, 20.06), ('0', 20.06, 20.18), ('1', 20.18, 20.3), ('0', 20.3, 21.02), ('1', 21.02, 21.14), ('0', 21.14, 23.24), ('1', 23.24, 23.36), ('0', 23.36, 24.56)]]




0.9395161290322581


  return np.vstack(
  return np.vstack(


[[], [-1]]
0.9435483870967742


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.64].wav', 'new_audio[1.64, 3.26].wav', 'new_audio[3.26, 4.16].wav', 'new_audio[4.16, 4.94].wav', 'new_audio[4.94, 6.56].wav', 'new_audio[6.56, 7.82].wav', 'new_audio[7.82, 8.72].wav', 'new_audio[8.72, 9.74].wav', 'new_audio[9.74, 11.12].wav', 'new_audio[11.12, 12.2].wav', 'new_audio[12.2, 12.26].wav', 'new_audio[12.26, 13.46].wav'], [('10', 0, 1.64), ('2', 1.64, 3.26), ('3', 3.26, 4.16), ('8', 4.16, 4.94), ('1', 4.94, 6.56), ('7', 6.56, 7.82), ('5', 7.82, 8.72), ('9', 8.72, 9.74), ('4', 9.74, 11.12), ('0', 11.12, 12.2), ('1', 12.2, 12.26), ('6', 12.26, 13.46)]]


  return f(*args, **kwargs)


0.9475806451612904


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.5].wav', 'new_audio[3.5, 3.92].wav', 'new_audio[3.92, 4.16].wav', 'new_audio[4.16, 4.46].wav', 'new_audio[4.46, 6.26].wav', 'new_audio[6.26, 14.3].wav', 'new_audio[14.3, 24.26].wav', 'new_audio[24.26, 26.0].wav', 'new_audio[26.0, 26.36].wav', 'new_audio[26.36, 26.48].wav', 'new_audio[26.48, 48.62].wav'], [('0', 0, 3.5), ('1', 3.5, 3.92), ('0', 3.92, 4.16), ('1', 4.16, 4.46), ('0', 4.46, 6.26), ('1', 6.26, 14.3), ('0', 14.3, 24.26), ('1', 24.26, 26.0), ('0', 26.0, 26.36), ('1', 26.36, 26.48), ('0', 26.48, 48.62)]]




0.9516129032258065


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 4.4].wav', 'new_audio[4.4, 6.2].wav', 'new_audio[6.2, 8.0].wav', 'new_audio[8.0, 8.36].wav', 'new_audio[8.36, 9.98].wav', 'new_audio[9.98, 12.08].wav', 'new_audio[12.08, 13.28].wav', 'new_audio[13.28, 14.96].wav', 'new_audio[14.96, 15.98].wav'], [('0', 0, 4.4), ('2', 4.4, 6.2), ('7', 6.2, 8.0), ('2', 8.0, 8.36), ('4', 8.36, 9.98), ('3', 9.98, 12.08), ('6', 12.08, 13.28), ('1', 13.28, 14.96), ('5', 14.96, 15.98)]]




0.9556451612903226


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 3.02].wav', 'new_audio[3.02, 5.18].wav', 'new_audio[5.18, 6.56].wav', 'new_audio[6.56, 8.0].wav', 'new_audio[8.0, 9.26].wav'], [('5', 0, 1.88), ('4', 1.88, 3.02), ('2', 3.02, 5.18), ('0', 5.18, 6.56), ('1', 6.56, 8.0), ('3', 8.0, 9.26)]]




0.9596774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.9637096774193549


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.42].wav', 'new_audio[2.42, 5.42].wav', 'new_audio[5.42, 6.62].wav', 'new_audio[6.62, 7.58].wav', 'new_audio[7.58, 9.2].wav', 'new_audio[9.2, 10.04].wav', 'new_audio[10.04, 11.0].wav', 'new_audio[11.0, 12.62].wav', 'new_audio[12.62, 13.82].wav', 'new_audio[13.82, 14.24].wav', 'new_audio[14.24, 15.32].wav', 'new_audio[15.32, 17.48].wav', 'new_audio[17.48, 19.04].wav', 'new_audio[19.04, 19.64].wav'], [('10', 0, 2.42), ('4', 2.42, 5.42), ('9', 5.42, 6.62), ('7', 6.62, 7.58), ('0', 7.58, 9.2), ('3', 9.2, 10.04), ('1', 10.04, 11.0), ('11', 11.0, 12.62), ('6', 12.62, 13.82), ('3', 13.82, 14.24), ('12', 14.24, 15.32), ('2', 15.32, 17.48), ('8', 17.48, 19.04), ('5', 19.04, 19.64)]]




0.967741935483871


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.04].wav', 'new_audio[1.04, 2.48].wav'], [('1', 0, 1.04), ('0', 1.04, 2.48)]]




0.9717741935483871


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.76].wav', 'new_audio[1.76, 2.9].wav', 'new_audio[2.9, 4.1].wav', 'new_audio[4.1, 5.42].wav'], [('1', 0, 1.76), ('2', 1.76, 2.9), ('3', 2.9, 4.1), ('0', 4.1, 5.42)]]




0.9758064516129032


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 2.06].wav', 'new_audio[2.06, 2.84].wav'], [('0', 0, 2.06), ('1', 2.06, 2.84)]]




0.9798387096774194


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.94].wav', 'new_audio[1.94, 3.68].wav', 'new_audio[3.68, 5.54].wav', 'new_audio[5.54, 7.1].wav', 'new_audio[7.1, 7.88].wav'], [('3', 0, 1.94), ('1', 1.94, 3.68), ('0', 3.68, 5.54), ('2', 5.54, 7.1), ('4', 7.1, 7.88)]]




0.9838709677419355


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.78].wav', 'new_audio[2.78, 3.74].wav', 'new_audio[3.74, 5.66].wav'], [('1', 0, 2.12), ('0', 2.12, 2.78), ('1', 2.78, 3.74), ('0', 3.74, 5.66)]]




0.9879032258064516


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 2.66].wav', 'new_audio[2.66, 4.1].wav', 'new_audio[4.1, 5.42].wav', 'new_audio[5.42, 6.5].wav', 'new_audio[6.5, 7.76].wav', 'new_audio[7.76, 8.78].wav', 'new_audio[8.78, 9.8].wav'], [('4', 0, 2.0), ('6', 2.0, 2.66), ('3', 2.66, 4.1), ('2', 4.1, 5.42), ('7', 5.42, 6.5), ('1', 6.5, 7.76), ('0', 7.76, 8.78), ('5', 8.78, 9.8)]]




0.9919354838709677


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
0.9959677419354839


In [6]:
res3

{'100': {'M': 2, 'F': 7},
 '1065': {'M': 1, 'F': 3},
 '1156': {'M': 0, 'F': 7},
 '1162': {'M': 0, 'F': 0},
 '1167': {'M': 5, 'F': 1},
 '1226': {'M': 0, 'F': 2},
 '125': {'M': 0, 'F': 0},
 '126': {'M': 0, 'F': 0},
 '127': {'M': 0, 'F': 0},
 '1310': {'M': 2, 'F': 2},
 '1313': {'M': 1, 'F': 3},
 '1327': {'M': 0, 'F': 3},
 '137': {'M': 0, 'F': 0},
 '15': {'M': 0, 'F': 0},
 '1508': {'M': 7, 'F': 3},
 '1532': {'M': 4, 'F': 3},
 '1551': {'M': 7, 'F': 3},
 '156': {'M': 5, 'F': 3},
 '157': {'M': 8, 'F': 1},
 '161': {'M': 0, 'F': 0},
 '1666': {'M': 7, 'F': 3},
 '1727': {'M': 2, 'F': 1},
 '173': {'M': 1, 'F': 4},
 '1730': {'M': 4, 'F': 0},
 '1731': {'M': 2, 'F': 0},
 '1754': {'M': 0, 'F': 0},
 '1757': {'M': 3, 'F': 5},
 '1758': {'M': 12, 'F': 9},
 '1770': {'M': 2, 'F': 0},
 '1777': {'M': 6, 'F': 0},
 '1819': {'M': 6, 'F': 2},
 '1826': {'M': 2, 'F': 0},
 '1827': {'M': 2, 'F': 0},
 '1832': {'M': 4, 'F': 1},
 '1875': {'M': 5, 'F': 6},
 '1892': {'M': 1, 'F': 1},
 '1976': {'M': 4, 'F': 5},
 '1990': {'

In [7]:
tab=pd.DataFrame(res3).to_csv('Equipe 2 - Résultats-2.csv',';')