In [2]:
pip install resemblyzer

Note: you may need to restart the kernel to use updated packages.


In [1]:
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
from inaSpeechSegmenter import Segmenter
from inaSpeechSegmenter.export_funcs import seg2csv, seg2textgrid
from spectralcluster import SpectralClusterer
from spectralcluster import RefinementOptions
from spectralcluster import ThresholdType
from spectralcluster import ICASSP2018_REFINEMENT_SEQUENCE
from resemblyzer.audio import sampling_rate
from pydub import AudioSegment
import numpy as np

In [2]:
def different_speakers(audio_path):
    """Split a .wav recording into one audio file per speaker turn.

    The pipeline runs four stages:

    1. detect the speech regions of the recording with inaSpeechSegmenter;
    2. concatenate those regions into a speech-only file, ``cut_audio.wav``;
    3. embed the speech-only audio with Resemblyzer and cluster the partial
       embeddings with spectral clustering to label each window;
    4. export one .wav file per run of consecutive identical labels.

    Parameters
    ----------
    audio_path : str
        Path to the .wav file to analyse.

    Returns
    -------
    list of str
        Paths of the exported files, named ``new_audio[start, end].wav``
        where ``start`` and ``end`` are in seconds on the timeline of the
        speech-only audio. An empty list is returned when no speech segment
        was detected.

    Notes
    -----
    Writes ``cut_audio.wav`` and the ``new_audio[...]`` files into the current
    working directory, overwriting existing files with the same names.
    """

    def extract_speech(audio_path):
        """Segment the recording into labelled regions.

        Returns the segmenter output, a sequence of ``(label, start, end)``
        entries such as ``('speech', 0.0, 0.14)``.
        """
        segmenter = Segmenter(detect_gender=False)
        return segmenter(audio_path)

    def concatenate_segments(segmentation):
        """Write the speech-only version of the recording to ``cut_audio.wav``.

        Returns the path of the written file, or ``None`` (writing nothing)
        when the segmentation contains no ``'speech'`` entry.
        """
        # (start, end) of every region labelled as speech.
        speech_spans = [(seg[1], seg[2]) for seg in segmentation
                        if seg[0] == 'speech']
        if not speech_spans:
            return None

        source = AudioSegment.from_wav(audio_path)
        speech_only = AudioSegment.empty()
        for start, end in speech_spans:
            # Segment bounds are multiplied by 1000 to index the AudioSegment
            # (seconds -> milliseconds).
            speech_only += source[start * 1000:end * 1000]

        speech_only.export(out_f="cut_audio.wav", format="wav")
        return "cut_audio.wav"

    def speaker_segmentation(audio):
        """Label the speech-only audio by speaker.

        Returns a list of ``(label, start, end)`` tuples, one per run of
        consecutive windows that received the same cluster label. ``label``
        is the cluster id as a string; ``start``/``end`` are in seconds.
        """
        wav = preprocess_wav(Path(audio))
        # NOTE(review): preprocess_wav may resample and trim long silences, in
        # which case the timestamps below are relative to the preprocessed
        # signal rather than to the file on disk - confirm against the
        # Resemblyzer documentation.
        encoder = VoiceEncoder("cpu")
        _, cont_embeds, wav_splits = encoder.embed_utterance(
            wav, return_partials=True, rate=16)

        # Cluster the partial embeddings to tell the speakers apart.
        refinement_options = RefinementOptions(
            gaussian_blur_sigma=1,
            p_percentile=0.90,
            thresholding_soft_multiplier=0.01,
            thresholding_type=ThresholdType.RowMax,
            refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE)
        clusterer = SpectralClusterer(min_clusters=2,
                                      max_clusters=100,
                                      refinement_options=refinement_options)
        labels = clusterer.predict(cont_embeds)

        # Each window is represented by the time of its centre, in seconds.
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]

        labelling = []
        start_time = 0
        for i, time in enumerate(times):
            # A label change closes the current run at this window's centre.
            if i > 0 and labels[i] != labels[i - 1]:
                labelling.append((str(labels[i - 1]), start_time, time))
                start_time = time
        if times:
            # Close the last run at the centre of the final window. If the
            # label changed on that very window this run has zero length.
            labelling.append((str(labels[-1]), start_time, times[-1]))

        return labelling

    def extract_audios(labelling, source_path):
        """Export one .wav file per labelled run and return the file names.

        ``source_path`` must be the audio the labelling was computed on, so
        that the run timestamps and the sliced audio share one timeline.
        """
        source = AudioSegment.from_wav(source_path)
        exported = []
        for _, start, end in labelling:
            out_f = "new_audio" + str([start, end]) + ".wav"
            source[start * 1000:end * 1000].export(out_f, format="wav")
            exported.append(out_f)
        return exported

    # Stage I: locate the speech regions.
    segmentation = extract_speech(audio_path)
    # Stage II: build the speech-only audio.
    cut_audio_path = concatenate_segments(segmentation)
    if cut_audio_path is None:
        # No speech detected: there is nothing to attribute to a speaker.
        return []
    # Stage III: label the speech-only audio by speaker.
    labelling = speaker_segmentation(cut_audio_path)
    # Stage IV: cut the speech-only audio (the audio the timestamps refer to),
    # not the original recording, which still contains the non-speech regions.
    return extract_audios(labelling, cut_audio_path)

In [3]:
# Smoke test: run the full pipeline on a sample recording and display the
# list of exported file names.

different_speakers(audio_path="Separation_speakers/6.wav")

Downloading data from https://github.com/ina-foss/inaSpeechSegmenter/releases/download/models/keras_speech_music_noise_cnn.hdf5


  return np.vstack(
  return np.vstack(
  wav = librosa.resample(wav, source_sr, sampling_rate)


Loaded the voice encoder model on cpu in 0.11 seconds.


  0.        ], sr=16000 as keyword args. From version 0.10 passing these as positional arguments will result in an error
  frames = librosa.feature.melspectrogram(


['new_audio[0, 2.0].wav',
 'new_audio[2.0, 3.38].wav',
 'new_audio[3.38, 4.34].wav',
 'new_audio[4.34, 5.72].wav',
 'new_audio[5.72, 7.16].wav',
 'new_audio[7.16, 8.72].wav',
 'new_audio[8.72, 10.34].wav']