In [1]:
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
from spectralcluster import SpectralClusterer
from spectralcluster import RefinementOptions
from spectralcluster import ThresholdType
from spectralcluster import ICASSP2018_REFINEMENT_SEQUENCE
from resemblyzer.audio import sampling_rate
from pydub import AudioSegment
import numpy as np
import os
import joblib
import librosa
import pandas as pd

## Définition de la fonction générale

In [2]:
def count_speakers1(audio_path):
    """Count the male and female speakers heard in a WAV file.

    Pipeline:
      1. Keep the regions that inaSpeechSegmenter labels ``'speech'``.
      2. Concatenate them into ``cut_audio.wav`` (current working directory).
      3. Diarize that file: Resemblyzer partial embeddings clustered with
         SpectralClusterer, turned into ``(label, start_s, end_s)`` turns.
      4. Export one ``new_audio[start, end].wav`` file per turn, sliced from
         the speech-only file the timestamps were computed on.
      5. Predict a gender for every turn with the pickled classifier, then
         take a majority vote per speaker label.

    Parameters
    ----------
    audio_path : str
        Path of the WAV file to analyse.

    Returns
    -------
    dict
        ``{'M': <male speakers>, 'F': <female speakers>}``. Both counts are 0
        when the speech-only file is too small to be diarized.

    Notes
    -----
    The intermediate WAV files are written to the current working directory
    under fixed names and are not removed afterwards.
    """
    # Speech-only files at or below this size (bytes) are not diarized.
    MIN_SPEECH_FILE_BYTES = 100000
    CUT_AUDIO_FILE = "cut_audio.wav"
    # Built with os.path.join so the path works on every OS (the previous
    # literal relied on a Windows backslash and an invalid "\m" escape).
    MODEL_PATH = os.path.join('Reconnaissance Femme_Homme', 'my_model.pkl')

    def extract_speech(audio_path):
        """Run inaSpeechSegmenter (no gender detection) on the file."""
        segmentation = Segmenter(detect_gender=False)
        return segmentation(audio_path)

    def concatenate_segments(segmentation):
        """Write every 'speech' region back to back and return the file name."""
        speech_spans = [(seg[1], seg[2]) for seg in segmentation if seg[0] == 'speech']

        source_audio = AudioSegment.from_wav(audio_path)
        speech_only = AudioSegment.empty()
        for start_s, end_s in speech_spans:
            # pydub slices are expressed in milliseconds
            speech_only += source_audio[start_s*1000:end_s*1000]
        speech_only.export(out_f=CUT_AUDIO_FILE, format="wav")
        return CUT_AUDIO_FILE

    def speaker_segmentation(audio):
        """Return speaker turns ``(label, start_s, end_s)`` for ``audio``.

        Returns ``[-1]`` when the file is too small to be processed.
        """
        labelling = []
        if os.stat(audio).st_size <= MIN_SPEECH_FILE_BYTES:
            labelling.append(-1)
            return labelling

        # NOTE(review): preprocess_wav may trim silences, in which case the
        # timestamps below are relative to the trimmed waveform - confirm.
        wav = preprocess_wav(Path(audio))
        encoder = VoiceEncoder("cpu")
        _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

        # Cluster the partial embeddings to tell the speakers apart
        refinement_options = RefinementOptions(gaussian_blur_sigma=1,
                                                p_percentile=0.90,
                                                thresholding_soft_multiplier=0.01,
                                                thresholding_type=ThresholdType.RowMax,
                                                refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE)

        clusterer = SpectralClusterer(min_clusters=2,
                                      max_clusters=100,
                                      refinement_options=refinement_options)

        labels = clusterer.predict(cont_embeds)

        # One timestamp (seconds) per partial window: the window's midpoint
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        start_time = 0

        # Close the running turn each time the cluster label changes
        for i in range(1, len(times)):
            if labels[i] != labels[i-1]:
                labelling.append((str(labels[i-1]), start_time, times[i]))
                start_time = times[i]

        # Final turn. It is skipped when the label changed on the very last
        # window, because that turn would be zero-length (an empty WAV file).
        if times and times[-1] > start_time:
            labelling.append((str(labels[-1]), start_time, times[-1]))
        return labelling

    def extract_audios(labelling, speech_audio_path):
        """Export one WAV file per turn and return the list of file names.

        Turns are cut from ``speech_audio_path`` (the speech-only file) since
        that is the timeline the turn timestamps refer to.
        """
        speech_audio = AudioSegment.from_wav(speech_audio_path)
        exported = []
        for turn in labelling:
            if turn == -1:
                continue
            clip = speech_audio[turn[1]*1000:turn[2]*1000]
            out_f = "new_audio"+str([turn[1],turn[2]])+".wav"
            clip.export(out_f, format="wav")
            exported.append(out_f)
        return exported

    def different_speakers(audio_path):
        """Run steps 1-4 and return ``[exported_files, labelling]``."""
        segmentation = extract_speech(audio_path)
        speech_audio_path = concatenate_segments(segmentation)
        labelling = speaker_segmentation(speech_audio_path)
        resultat = extract_audios(labelling, speech_audio_path)
        return [resultat, labelling]

    def get_char_son(fichier):
        """Build the one-row feature frame (indexed by file name) for a clip.

        Features are the flattened components of a single-component
        decomposition of the clip's magnitude spectrogram.
        """
        signal, sr = librosa.load(fichier)
        S = np.abs(librosa.stft(signal))
        comps, acts = librosa.decompose.decompose(S, n_components=1)
        data = pd.DataFrame([np.ravel(comps).tolist()])
        data['ID'] = fichier
        return data.set_index('ID')

    def predict_sound(model, tab):
        """Map the classifier output to 'Homme' (class 0) or 'Femme'."""
        if model.predict(tab)[0] == 0:
            return 'Homme'
        return 'Femme'

    part1 = different_speakers(audio_path)
    print(part1)
    files, labelling = part1
    turns = [turn for turn in labelling if turn != -1]

    # Load the classifier once per call, and only when there is something
    # to classify.
    # NOTE: joblib.load unpickles the file - only use a trusted model file.
    model = joblib.load(MODEL_PATH) if files else None

    # Gather the per-turn predictions under the speaker (cluster) label, so
    # that a speaker who talks several times is counted only once.
    votes_by_speaker = {}
    for turn, wav_file in zip(turns, files):
        pred_sex = predict_sound(model, get_char_son(wav_file))
        votes_by_speaker.setdefault(turn[0], []).append(pred_sex)

    c_male = 0
    c_female = 0
    for votes in votes_by_speaker.values():
        # Majority vote across the speaker's turns; a tie resolves to 'Homme'
        if votes.count('Homme') >= votes.count('Femme'):
            c_male += 1
        else:
            c_female += 1

    result = {'M': c_male, 'F': c_female}
    return result
    

In [3]:
# Smoke test on a single clip; the cell displays the returned {'M': ..., 'F': ...} dict.
count_speakers1("moviesoundclips.net_test/moviesoundclips.net/959.wav")

  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.08 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 2.66].wav', 'new_audio[2.66, 4.1].wav', 'new_audio[4.1, 5.42].wav', 'new_audio[5.42, 6.5].wav', 'new_audio[6.5, 7.76].wav', 'new_audio[7.76, 8.78].wav', 'new_audio[8.78, 9.8].wav'], [('4', 0, 2.0), ('6', 2.0, 2.66), ('3', 2.66, 4.1), ('2', 4.1, 5.42), ('7', 5.42, 6.5), ('1', 6.5, 7.76), ('0', 7.76, 8.78), ('5', 8.78, 9.8)]]




{'M': 8, 'F': 0}

## Liste des fichiers à tester

In [4]:
import os
res={}
# os.listdir returns entries in arbitrary order; sort them so that the
# processing order (and the progress output) is the same on every run.
L=sorted(os.listdir('moviesoundclips.net_test/moviesoundclips.net/'))
print(L)

['100.wav', '1065.wav', '1156.wav', '1162.wav', '1167.wav', '1226.wav', '125.wav', '126.wav', '127.wav', '1310.wav', '1313.wav', '1327.wav', '137.wav', '15.wav', '1508.wav', '1532.wav', '1551.wav', '156.wav', '157.wav', '161.wav', '1666.wav', '1727.wav', '173.wav', '1730.wav', '1731.wav', '1754.wav', '1757.wav', '1758.wav', '1770.wav', '1777.wav', '1819.wav', '1826.wav', '1827.wav', '1832.wav', '1875.wav', '1892.wav', '1976.wav', '1990.wav', '1994.wav', '1996.wav', '2062.wav', '2071.wav', '2087.wav', '2097.wav', '2129.wav', '2134.wav', '2159.wav', '2163.wav', '2164.wav', '2191.wav', '2195.wav', '220.wav', '2213.wav', '2296.wav', '2317.wav', '2322.wav', '2375.wav', '2382.wav', '2389.wav', '2392.wav', '2395.wav', '240.wav', '2406.wav', '2428.wav', '2430.wav', '2453.wav', '2455.wav', '2458.wav', '2463.wav', '2470.wav', '2481.wav', '2538.wav', '2542.wav', '2546.wav', '2553.wav', '2569.wav', '2636.wav', '2667.wav', '2693.wav', '2696.wav', '2704.wav', '2724.wav', '2751.wav', '2753.wav', '280

# Test sur le jeu de données

In [None]:
# Run the pipeline on every listed clip; res3 maps the clip name (without
# extension) to the {'M': ..., 'F': ...} dict returned for it.
res3={}
for i in range(len(L)):
    # splitext strips only the final extension and is safe for names
    # without a dot (str.find would return -1 and chop the last character).
    num=os.path.splitext(L[i])[0]
    try:
        res_prov=count_speakers1("moviesoundclips.net_test/moviesoundclips.net/"+L[i])
        res3[str(num)]=res_prov
    except Exception as exc:
        # Keep going on a failing clip, but say which one failed and why.
        # Catching Exception (not a bare except) lets a kernel interrupt
        # stop the loop.
        print('Erreur', L[i], repr(exc))
    print(i/len(L))

  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.46].wav', 'new_audio[1.46, 2.84].wav', 'new_audio[2.84, 3.2].wav', 'new_audio[3.2, 4.34].wav', 'new_audio[4.34, 6.08].wav', 'new_audio[6.08, 6.62].wav', 'new_audio[6.62, 7.94].wav', 'new_audio[7.94, 8.84].wav', 'new_audio[8.84, 10.64].wav'], [('4', 0, 1.46), ('6', 1.46, 2.84), ('7', 2.84, 3.2), ('0', 3.2, 4.34), ('1', 4.34, 6.08), ('7', 6.08, 6.62), ('2', 6.62, 7.94), ('5', 7.94, 8.84), ('3', 8.84, 10.64)]]




Erreur
0.0


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 5.42].wav', 'new_audio[5.42, 6.2].wav', 'new_audio[6.2, 9.32].wav', 'new_audio[9.32, 15.74].wav'], [('1', 0, 5.42), ('0', 5.42, 6.2), ('1', 6.2, 9.32), ('0', 9.32, 15.74)]]




Erreur
0.004032258064516129


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.94].wav', 'new_audio[1.94, 3.26].wav', 'new_audio[3.26, 4.58].wav', 'new_audio[4.58, 5.96].wav', 'new_audio[5.96, 7.76].wav', 'new_audio[7.76, 7.82].wav', 'new_audio[7.82, 9.5].wav'], [('1', 0, 1.94), ('5', 1.94, 3.26), ('0', 3.26, 4.58), ('4', 4.58, 5.96), ('3', 5.96, 7.76), ('0', 7.76, 7.82), ('2', 7.82, 9.5)]]


  return f(*args, **kwargs)


Erreur
0.008064516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
Erreur
0.012096774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.03 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.24].wav', 'new_audio[2.24, 3.14].wav', 'new_audio[3.14, 4.34].wav', 'new_audio[4.34, 5.48].wav', 'new_audio[5.48, 7.34].wav', 'new_audio[7.34, 8.3].wav'], [('3', 0, 2.24), ('5', 2.24, 3.14), ('0', 3.14, 4.34), ('4', 4.34, 5.48), ('1', 5.48, 7.34), ('2', 7.34, 8.3)]]




Erreur
0.016129032258064516


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 3.44].wav'], [('1', 0, 2.0), ('0', 2.0, 3.44)]]




Erreur
0.020161290322580645


  return np.vstack(
  return np.vstack(


[[], [-1]]
Erreur
0.024193548387096774


  return np.vstack(
  return np.vstack(


[[], [-1]]
Erreur
0.028225806451612902


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
Erreur
0.03225806451612903


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.72].wav', 'new_audio[2.72, 3.14].wav', 'new_audio[3.14, 4.52].wav'], [('2', 0, 2.12), ('0', 2.12, 2.72), ('3', 2.72, 3.14), ('1', 3.14, 4.52)]]




Erreur
0.036290322580645164


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 3.38].wav', 'new_audio[3.38, 5.0].wav', 'new_audio[5.0, 6.62].wav'], [('3', 0, 2.0), ('1', 2.0, 3.38), ('2', 3.38, 5.0), ('0', 5.0, 6.62)]]




Erreur
0.04032258064516129


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.9].wav', 'new_audio[2.9, 4.04].wav'], [('0', 0, 2.12), ('2', 2.12, 2.9), ('1', 2.9, 4.04)]]




Erreur
0.04435483870967742


  return np.vstack(
  return np.vstack(


[[], [-1]]
Erreur
0.04838709677419355


  return np.vstack(
  return np.vstack(


[[], [-1]]
Erreur
0.05241935483870968


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.46].wav', 'new_audio[1.46, 2.96].wav', 'new_audio[2.96, 3.56].wav', 'new_audio[3.56, 4.52].wav', 'new_audio[4.52, 6.02].wav', 'new_audio[6.02, 7.1].wav', 'new_audio[7.1, 7.76].wav', 'new_audio[7.76, 9.5].wav', 'new_audio[9.5, 10.22].wav', 'new_audio[10.22, 11.9].wav'], [('6', 0, 1.46), ('5', 1.46, 2.96), ('0', 2.96, 3.56), ('8', 3.56, 4.52), ('2', 4.52, 6.02), ('7', 6.02, 7.1), ('1', 7.1, 7.76), ('3', 7.76, 9.5), ('1', 9.5, 10.22), ('4', 10.22, 11.9)]]




Erreur
0.056451612903225805


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 4.46].wav', 'new_audio[4.46, 5.48].wav', 'new_audio[5.48, 7.52].wav', 'new_audio[7.52, 7.76].wav', 'new_audio[7.76, 9.5].wav', 'new_audio[9.5, 10.7].wav'], [('5', 0, 3.02), ('0', 3.02, 4.46), ('4', 4.46, 5.48), ('1', 5.48, 7.52), ('0', 7.52, 7.76), ('2', 7.76, 9.5), ('3', 9.5, 10.7)]]




Erreur
0.06048387096774194


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.82].wav', 'new_audio[1.82, 3.56].wav', 'new_audio[3.56, 4.76].wav', 'new_audio[4.76, 5.3].wav', 'new_audio[5.3, 6.74].wav', 'new_audio[6.74, 8.54].wav', 'new_audio[8.54, 10.28].wav', 'new_audio[10.28, 11.72].wav', 'new_audio[11.72, 13.16].wav', 'new_audio[13.16, 15.38].wav'], [('4', 0, 1.82), ('1', 1.82, 3.56), ('8', 3.56, 4.76), ('2', 4.76, 5.3), ('6', 5.3, 6.74), ('7', 6.74, 8.54), ('0', 8.54, 10.28), ('3', 10.28, 11.72), ('5', 11.72, 13.16), ('2', 13.16, 15.38)]]




Erreur
0.06451612903225806


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.7].wav', 'new_audio[1.7, 2.3].wav', 'new_audio[2.3, 3.08].wav', 'new_audio[3.08, 4.82].wav', 'new_audio[4.82, 6.32].wav', 'new_audio[6.32, 7.52].wav', 'new_audio[7.52, 8.06].wav', 'new_audio[8.06, 10.04].wav'], [('5', 0, 1.7), ('6', 1.7, 2.3), ('0', 2.3, 3.08), ('3', 3.08, 4.82), ('2', 4.82, 6.32), ('4', 6.32, 7.52), ('7', 7.52, 8.06), ('1', 8.06, 10.04)]]




Erreur
0.06854838709677419


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.0000000e+00  0.0000000e+00], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.46].wav', 'new_audio[1.46, 2.06].wav', 'new_audio[2.06, 3.38].wav', 'new_audio[3.38, 4.76].wav', 'new_audio[4.76, 6.32].wav', 'new_audio[6.32, 7.34].wav', 'new_audio[7.34, 7.58].wav', 'new_audio[7.58, 8.84].wav', 'new_audio[8.84, 10.1].wav'], [('2', 0, 1.46), ('0', 1.46, 2.06), ('4', 2.06, 3.38), ('5', 3.38, 4.76), ('1', 4.76, 6.32), ('6', 6.32, 7.34), ('0', 7.34, 7.58), ('7', 7.58, 8.84), ('3', 8.84, 10.1)]]




Erreur
0.07258064516129033


  return np.vstack(
  return np.vstack(


[[], [-1]]
Erreur
0.07661290322580645


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 3.92].wav', 'new_audio[3.92, 5.18].wav', 'new_audio[5.18, 6.8].wav', 'new_audio[6.8, 6.92].wav', 'new_audio[6.92, 8.84].wav', 'new_audio[8.84, 10.52].wav', 'new_audio[10.52, 12.2].wav', 'new_audio[12.2, 13.4].wav', 'new_audio[13.4, 15.38].wav'], [('6', 0, 1.88), ('1', 1.88, 3.92), ('5', 3.92, 5.18), ('3', 5.18, 6.8), ('8', 6.8, 6.92), ('2', 6.92, 8.84), ('8', 8.84, 10.52), ('0', 10.52, 12.2), ('7', 12.2, 13.4), ('4', 13.4, 15.38)]]




Erreur
0.08064516129032258


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.34].wav', 'new_audio[1.34, 3.26].wav', 'new_audio[3.26, 3.62].wav'], [('2', 0, 1.34), ('1', 1.34, 3.26), ('0', 3.26, 3.62)]]




Erreur
0.0846774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.96].wav', 'new_audio[2.96, 4.34].wav', 'new_audio[4.34, 6.2].wav', 'new_audio[6.2, 6.68].wav', 'new_audio[6.68, 8.78].wav'], [('2', 0, 2.96), ('1', 2.96, 4.34), ('3', 4.34, 6.2), ('1', 6.2, 6.68), ('0', 6.68, 8.78)]]




Erreur
0.08870967741935484


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 5.42].wav', 'new_audio[5.42, 7.82].wav', 'new_audio[7.82, 10.52].wav'], [('3', 0, 3.02), ('1', 3.02, 5.42), ('0', 5.42, 7.82), ('2', 7.82, 10.52)]]




Erreur
0.09274193548387097


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.34].wav', 'new_audio[1.34, 1.7].wav'], [('1', 0, 1.34), ('0', 1.34, 1.7)]]




Erreur
0.0967741935483871


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
Erreur
0.10080645161290322


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 8.96].wav', 'new_audio[8.96, 9.08].wav', 'new_audio[9.08, 14.72].wav', 'new_audio[14.72, 18.92].wav', 'new_audio[18.92, 19.04].wav', 'new_audio[19.04, 19.16].wav', 'new_audio[19.16, 21.74].wav', 'new_audio[21.74, 28.58].wav'], [('0', 0, 8.96), ('1', 8.96, 9.08), ('0', 9.08, 14.72), ('1', 14.72, 18.92), ('0', 18.92, 19.04), ('1', 19.04, 19.16), ('0', 19.16, 21.74), ('1', 21.74, 28.58)]]




Erreur
0.10483870967741936


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.72].wav', 'new_audio[2.72, 4.76].wav', 'new_audio[4.76, 5.12].wav', 'new_audio[5.12, 6.2].wav', 'new_audio[6.2, 8.3].wav', 'new_audio[8.3, 9.98].wav', 'new_audio[9.98, 10.82].wav', 'new_audio[10.82, 13.04].wav', 'new_audio[13.04, 14.96].wav', 'new_audio[14.96, 15.02].wav', 'new_audio[15.02, 15.08].wav', 'new_audio[15.08, 16.94].wav', 'new_audio[16.94, 17.9].wav', 'new_audio[17.9, 19.64].wav', 'new_audio[19.64, 20.36].wav', 'new_audio[20.36, 20.42].wav', 'new_audio[20.42, 20.48].wav', 'new_audio[20.48, 21.44].wav', 'new_audio[21.44, 21.5].wav', 'new_audio[21.5, 21.86].wav', 'new_audio[21.86, 23.66].wav'], [('7', 0, 2.72), ('6', 2.72, 4.76), ('4', 4.76, 5.12), ('9', 5.12, 6.2), ('10', 6.2, 8.3), ('2', 8.3, 9.98), ('4', 9.98, 10.82), ('8', 10.82, 13.04), ('5', 13.04, 14.96), ('6', 14.96, 15.02), ('8', 15.02, 15.08), ('1', 15.08, 16.94), ('4', 16.94, 17.9), ('0', 17.9, 19.64), ('4', 19.64, 20.36), ('1', 20.36, 20.42), ('9', 20.42, 20.48), ('1', 20.48, 21.44), ('3', 21.44,

  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Erreur
0.10887096774193548


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.02].wav', 'new_audio[3.02, 4.76].wav'], [('0', 0, 3.02), ('1', 3.02, 4.76)]]




Erreur
0.11290322580645161


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.0].wav', 'new_audio[2.0, 2.12].wav', 'new_audio[2.12, 2.9].wav', 'new_audio[2.9, 5.24].wav', 'new_audio[5.24, 6.02].wav', 'new_audio[6.02, 7.52].wav'], [('4', 0, 2.0), ('0', 2.0, 2.12), ('3', 2.12, 2.9), ('2', 2.9, 5.24), ('0', 5.24, 6.02), ('1', 6.02, 7.52)]]




Erreur
0.11693548387096774


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Erreur
0.12096774193548387


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.52].wav', 'new_audio[1.52, 3.5].wav'], [('1', 0, 1.52), ('0', 1.52, 3.5)]]




Erreur
0.125


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.04].wav', 'new_audio[1.04, 2.0].wav'], [('1', 0, 1.04), ('0', 1.04, 2.0)]]




Erreur
0.12903225806451613


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.9].wav', 'new_audio[2.9, 6.56].wav', 'new_audio[6.56, 10.82].wav', 'new_audio[10.82, 12.56].wav', 'new_audio[12.56, 13.94].wav'], [('4', 0, 2.9), ('0', 2.9, 6.56), ('1', 6.56, 10.82), ('3', 10.82, 12.56), ('2', 12.56, 13.94)]]




Erreur
0.13306451612903225


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Erreur
0.13709677419354838


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.84].wav', 'new_audio[2.84, 3.98].wav'], [('1', 0, 2.84), ('0', 2.84, 3.98)]]




Erreur
0.14112903225806453


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 2.12].wav', 'new_audio[2.12, 2.78].wav', 'new_audio[2.78, 3.68].wav', 'new_audio[3.68, 5.42].wav', 'new_audio[5.42, 6.44].wav', 'new_audio[6.44, 6.86].wav', 'new_audio[6.86, 7.4].wav', 'new_audio[7.4, 9.14].wav', 'new_audio[9.14, 10.46].wav'], [('2', 0, 2.12), ('3', 2.12, 2.78), ('5', 2.78, 3.68), ('3', 3.68, 5.42), ('6', 5.42, 6.44), ('4', 6.44, 6.86), ('1', 6.86, 7.4), ('7', 7.4, 9.14), ('0', 9.14, 10.46)]]




Erreur
0.14516129032258066


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.36].wav', 'new_audio[2.36, 2.66].wav', 'new_audio[2.66, 3.44].wav', 'new_audio[3.44, 3.5].wav', 'new_audio[3.5, 5.18].wav', 'new_audio[5.18, 5.72].wav', 'new_audio[5.72, 6.5].wav'], [('2', 0, 2.36), ('5', 2.36, 2.66), ('3', 2.66, 3.44), ('5', 3.44, 3.5), ('1', 3.5, 5.18), ('0', 5.18, 5.72), ('4', 5.72, 6.5)]]


  return f(*args, **kwargs)


Erreur
0.14919354838709678


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 3.14].wav', 'new_audio[3.14, 3.2].wav', 'new_audio[3.2, 3.32].wav', 'new_audio[3.32, 4.4].wav', 'new_audio[4.4, 13.04].wav', 'new_audio[13.04, 13.94].wav', 'new_audio[13.94, 18.98].wav', 'new_audio[18.98, 19.52].wav', 'new_audio[19.52, 20.54].wav', 'new_audio[20.54, 21.8].wav', 'new_audio[21.8, 26.48].wav', 'new_audio[26.48, 27.14].wav', 'new_audio[27.14, 31.04].wav', 'new_audio[31.04, 32.66].wav', 'new_audio[32.66, 36.74].wav', 'new_audio[36.74, 38.18].wav', 'new_audio[38.18, 42.98].wav', 'new_audio[42.98, 43.76].wav', 'new_audio[43.76, 46.52].wav', 'new_audio[46.52, 46.64].wav', 'new_audio[46.64, 55.28].wav', 'new_audio[55.28, 61.94].wav', 'new_audio[61.94, 63.68].wav'], [('0', 0, 3.14), ('1', 3.14, 3.2), ('0', 3.2, 3.32), ('1', 3.32, 4.4), ('0', 4.4, 13.04), ('1', 13.04, 13.94), ('0', 13.94, 18.98), ('1', 18.98, 19.52), ('0', 19.52, 20.54), ('1', 20.54, 21.8), ('0', 21.8, 26.48), ('1', 26.48, 27.14), ('0', 27.14, 31.04), ('1', 31.04, 32.66), ('0', 32.66, 36.74), ('1'

  return f(*args, **kwargs)




Erreur
0.1532258064516129


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.03 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.88].wav', 'new_audio[1.88, 2.9].wav', 'new_audio[2.9, 4.58].wav', 'new_audio[4.58, 5.84].wav', 'new_audio[5.84, 6.86].wav'], [('0', 0, 1.88), ('3', 1.88, 2.9), ('2', 2.9, 4.58), ('1', 4.58, 5.84), ('4', 5.84, 6.86)]]




Erreur
0.15725806451612903


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 2.24].wav', 'new_audio[2.24, 3.02].wav'], [('0', 0, 2.24), ('1', 2.24, 3.02)]]




Erreur
0.16129032258064516


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.02 seconds.


  frames = librosa.feature.melspectrogram(


[['new_audio[0, 1.1].wav', 'new_audio[1.1, 2.72].wav', 'new_audio[2.72, 4.46].wav', 'new_audio[4.46, 5.24].wav', 'new_audio[5.24, 5.3].wav', 'new_audio[5.3, 6.98].wav', 'new_audio[6.98, 8.42].wav', 'new_audio[8.42, 9.68].wav', 'new_audio[9.68, 10.7].wav', 'new_audio[10.7, 11.12].wav', 'new_audio[11.12, 11.24].wav', 'new_audio[11.24, 11.54].wav', 'new_audio[11.54, 11.66].wav', 'new_audio[11.66, 13.28].wav', 'new_audio[13.28, 14.36].wav', 'new_audio[14.36, 15.62].wav'], [('4', 0, 1.1), ('0', 1.1, 2.72), ('10', 2.72, 4.46), ('5', 4.46, 5.24), ('6', 5.24, 5.3), ('2', 5.3, 6.98), ('1', 6.98, 8.42), ('3', 8.42, 9.68), ('7', 9.68, 10.7), ('5', 10.7, 11.12), ('0', 11.12, 11.24), ('5', 11.24, 11.54), ('0', 11.54, 11.66), ('6', 11.66, 13.28), ('9', 13.28, 14.36), ('8', 14.36, 15.62)]]


  return f(*args, **kwargs)




Erreur
0.16532258064516128


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.01 seconds.
[['new_audio[0, 1.22].wav', 'new_audio[1.22, 2.72].wav'], [('0', 0, 1.22), ('1', 1.22, 2.72)]]




Erreur
0.1693548387096774


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)


[[], [-1]]
Erreur
0.17338709677419356


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)
  frames = librosa.feature.melspectrogram(


Loaded the voice encoder model on cpu in 0.02 seconds.
[['new_audio[0, 2.6].wav', 'new_audio[2.6, 3.92].wav'], [('1', 0, 2.6), ('0', 2.6, 3.92)]]




Erreur
0.1774193548387097


  return np.vstack(
  return np.vstack(
  data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
  x = asanyarray(arr - arrmean)
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.01 seconds.


  frames = librosa.feature.melspectrogram(


In [None]:
# Display the per-clip results gathered by the batch loop.
res3

In [None]:
# Keep the DataFrame itself in `tab`: to_csv returns None when given a path,
# so the previous assignment always stored None. The separator is passed by
# keyword because positional use is deprecated in recent pandas versions.
tab=pd.DataFrame(res3)
tab.to_csv('Equipe 2 - Résultats-2.csv', sep=';')