First we need an audio file to diarize.

In [1]:
# Mount Google Drive at /content/drive so the audio file stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Resemblyzer must be installed before the import in the next cell will work.
# On a fresh runtime, run the command below in a cell (prefix it with `%` so it uses the kernel's pip):
# pip install resemblyzer

In [3]:
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path

# Path of the audio file to diarize; change this to point at your own recording.
audio_file_path = '/content/drive/MyDrive/NLP Research/audio_data_donald_trump_real_4glfwiMXgwQ.mp3'
wav_fpath = Path(audio_file_path)

# Load the file through Resemblyzer's preprocessing to get the waveform passed to the encoder.
wav = preprocess_wav(wav_fpath)
# Run the voice encoder on the CPU.
encoder = VoiceEncoder("cpu")
# return_partials=True additionally returns the partial embeddings (cont_embeds) and the
# sample ranges they were computed from (wav_splits); the first return value is unused here.
# NOTE(review): rate=16 presumably requests about 16 partial windows per second of audio;
# confirm against the Resemblyzer embed_utterance documentation.
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
# Presumably (number of partial windows, embedding size).
print(cont_embeds.shape)



Loaded the voice encoder model on cpu in 0.03 seconds.
(545, 256)


Sometimes the module might throw an error when you give it an audio file whose extension is not wav, such as mp3. In that case, you might need to convert your mp3 file to wav before using it. You can use the script below (you will need to install pydub first).

In [5]:
pip install pydub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [6]:
from pydub import AudioSegment

def mp3_to_wav(audio_file_path):
    """Convert an mp3 file to wav, writing the result next to the source file.

    Args:
        audio_file_path: Path of the mp3 file to convert.

    Returns:
        The path of the exported wav file as a string: the input path with
        only its final extension replaced by ``.wav``.
    """
    import os

    sound = AudioSegment.from_mp3(audio_file_path)
    # Strip only the trailing extension. Cutting at the first "." in the whole
    # path would truncate it whenever a directory or the file name contains a dot.
    wav_file_path = os.path.splitext(audio_file_path)[0] + '.wav'
    sound.export(wav_file_path, format="wav")
    return wav_file_path

# Convert only while the path still points at an mp3. This cell rebinds audio_file_path to the
# wav path, so re-running it would otherwise pass the .wav file back into the mp3 loader.
if str(audio_file_path).lower().endswith('.mp3'):
    audio_file_path = mp3_to_wav(audio_file_path)
print(audio_file_path)

/content/drive/MyDrive/NLP Research/audio_data_donald_trump_real_4glfwiMXgwQ.wav


The next step is to cluster our d-vectors. For this, we will use an open-source implementation of spectral clustering by Quan Wang, one of the original authors of the paper we are implementing, who has been generous enough to make the code available.

In [8]:
pip install spectralcluster

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spectralcluster
  Downloading spectralcluster-0.2.16-py3-none-any.whl (28 kB)
Installing collected packages: spectralcluster
Successfully installed spectralcluster-0.2.16


In [16]:
from spectralcluster import SpectralClusterer

# Build the spectral clusterer. min_clusters/max_clusters presumably bound how many
# speakers it may find; confirm against the spectralcluster documentation.
clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=100
    # NOTE(review): the two keyword arguments below are disabled. Check that the installed
    # spectralcluster version accepts them before re-enabling.
    # p_percentile=0.90,
    # gaussian_blur_sigma=1
)

# Cluster the partial embeddings; later code indexes labels in step with wav_splits.
labels = clusterer.predict(cont_embeds)

Creating continuous segments

In [18]:
def create_labelling(labels, wav_splits, sampling_rate=None):
    """Collapse per-window labels into continuous (label, start, end) segments.

    Each window is placed on the timeline at its midpoint,
    ``(split.start + split.stop) / 2 / sampling_rate``. A segment is closed at
    the midpoint of the first window whose label differs from the previous
    window's label, and the next segment opens at that same instant.

    Args:
        labels: Indexable sequence holding one label per entry of ``wav_splits``.
        wav_splits: Sequence of objects exposing ``start`` and ``stop`` sample
            offsets (for example ``slice`` objects).
        sampling_rate: Divisor that converts sample offsets into time values.
            When omitted, ``resemblyzer.audio.sampling_rate`` is used.

    Returns:
        A list of ``(str(label), start_time, end_time)`` tuples in order of
        appearance. The first segment starts at 0 and the last segment ends at
        the midpoint of the final window. An empty ``wav_splits`` gives ``[]``.
    """
    if sampling_rate is None:
        # Imported lazily so callers that pass an explicit rate do not need Resemblyzer.
        from resemblyzer.audio import sampling_rate as default_rate
        sampling_rate = default_rate

    midpoints = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    last_index = len(midpoints) - 1
    labelling = []
    start_time = 0

    for i, midpoint in enumerate(midpoints):
        # A label change closes the running segment at this window's midpoint.
        if i > 0 and labels[i] != labels[i - 1]:
            labelling.append((str(labels[i - 1]), start_time, midpoint))
            start_time = midpoint
        # The final window always closes whatever segment is still open.
        if i == last_index:
            labelling.append((str(labels[i]), start_time, midpoint))

    return labelling
  
# Build the (label, start, end) segment list from the cluster labels and the window ranges.
labelling = create_labelling(labels,wav_splits)

Using our diarization labels

In [19]:
# Display the (label, start, end) segments.
labelling

[('0', 0, 2.6),
 ('1', 2.6, 4.64),
 ('0', 4.64, 4.76),
 ('1', 4.76, 4.88),
 ('0', 4.88, 6.26),
 ('1', 6.26, 8.12),
 ('0', 8.12, 8.18),
 ('1', 8.18, 8.24),
 ('0', 8.24, 10.58),
 ('1', 10.58, 11.84),
 ('0', 11.84, 12.26),
 ('1', 12.26, 13.28),
 ('0', 13.28, 13.4),
 ('1', 13.4, 13.64),
 ('0', 13.64, 13.76),
 ('1', 13.76, 21.32),
 ('0', 21.32, 21.74),
 ('1', 21.74, 21.8),
 ('0', 21.8, 21.92),
 ('1', 21.92, 22.1),
 ('0', 22.1, 25.52),
 ('1', 25.52, 26.6),
 ('0', 26.6, 30.08),
 ('1', 30.08, 30.68),
 ('0', 30.68, 31.1),
 ('1', 31.1, 31.52),
 ('0', 31.52, 33.44)]

 You can now use these labels to create a text transcription of your audio call 