In [None]:
"""
## Install dependencies
! pip install wget
! apt-get install sox libsndfile1 ffmpeg
! pip install text-unidecode

## Install NeMo
BRANCH = 'r1.20.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

## Install TorchAudio
! pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html

"""

In [None]:
# Path of the audio file that later cells load with librosa and list in the
# diarization manifest. 'XXX' is presumably a placeholder — set it to a real
# file before running the notebook.
audio_path = 'XXX'

In [None]:
from whisper import load_model

# Larger checkpoints give considerably better transcripts and a better-aligned
# (word, timestamp) mapping.
model = load_model("large-v2")

# beam_size is None by default (greedy decoding). Setting it to a number such
# as 5 improves transcription quality but increases runtime considerably.
#
# Transcribe `audio_path` rather than a hard-coded file name so the transcript
# comes from the same recording that is loaded and diarized further down.
results = model.transcribe(audio_path, beam_size=None)

In [None]:
import whisperx

device = 'cuda'

# The alignment model is chosen from the language Whisper detected.
model_a, metadata = whisperx.load_align_model(language_code=results["language"], device=device)

# Align against `audio_path`, the same recording used by the rest of the
# notebook, and keep the output in a named variable instead of discarding it.
# NOTE(review): the trailing positional arguments (0.0, False, False) are passed
# through unchanged; their meaning depends on the installed whisperx version —
# confirm against that version's `align` signature.
aligned_results = whisperx.align(results["segments"], model_a, metadata, audio_path, device, 0.0, False, False)

In [None]:
"""
Your WhisperX alignment may fail. If this happens, it is most probably because Whisper hallucinated, i.e. Whisper came up with extra/weird output at the end.

This usually happens with long audio files. If it does, I'd suggest splitting big audio files into smaller files.

# Storing words <> timestamps mapping in a file.

import json

with open('./word_ts.text', 'w+') as f:
    for result in results['segments']:
      for line in result['word-level']:
        line_temp = line.copy()
        # WhisperX don't put a space after word but just to make sure.
        line_temp['text'] = line_temp['text'].strip()
        f.write(f'{json.dumps(line_temp)}\n')


"""

In [None]:
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
from nemo.collections.asr.models.msdd_models import ClusteringDiarizer

import numpy as np
from IPython.display import Audio, display
import librosa
import os
import wget
import matplotlib.pyplot as plt

import nemo
import glob

from omegaconf import OmegaConf
import shutil

import pprint
pp = pprint.PrettyPrinter(indent=4)

In [None]:

# Read the recording at `audio_path` into `signal` and its sampling rate into `sr`.
# NOTE(review): sr=None presumably keeps the file's native sampling rate instead
# of resampling — confirm against the librosa.load documentation.
signal, sr = librosa.load(audio_path,sr=None) 

In [None]:
def display_waveform(signal,text='Audio',overlay_color=[],sample_rate=16000):
    """Scatter-plot a waveform, optionally recolored sample by sample.

    Args:
        signal: 1-D sequence of audio samples.
        text: Title shown above the figure.
        overlay_color: Optional per-sample colors (for example the array
            returned by ``get_color``) drawn on top of the black base plot.
            Nothing is overlaid when it is empty.
        sample_rate: Samples per second, used to convert the x tick positions
            from sample indices to seconds. Defaults to 16000, the same
            default ``get_color`` uses.
    """
    sample_index = np.arange(len(signal))

    fig,ax = plt.subplots(1,1)
    fig.set_figwidth(20)
    fig.set_figheight(2)
    plt.scatter(sample_index,signal,s=1,marker='o',c='k')
    if len(overlay_color):
        plt.scatter(sample_index,signal,s=1,marker='o',c=overlay_color)
    fig.suptitle(text, fontsize=16)
    plt.xlabel('time (secs)', fontsize=18)
    plt.ylabel('signal strength', fontsize=14);
    plt.axis([0,len(signal),-0.5,+0.5])
    # Ticks are positioned in samples; relabel them in seconds. The rate is now
    # a parameter: the function previously read a global `sample_rate` that the
    # notebook never defines, which raised NameError.
    time_axis,_ = plt.xticks();
    plt.xticks(time_axis[:-1],time_axis[:-1]/sample_rate);
    
# Single-letter matplotlib color codes, indexed by speaker number.
COLORS="b g c m y".split()

def get_color(signal,speech_labels,sample_rate=16000):
    """Build a per-sample color array from time-stamped labels.

    Args:
        signal: Sequence of audio samples; only its length is used.
        speech_labels: Iterable of strings of the form ``"<start> <end> <label>"``
            with start/end in seconds. ``label`` is either ``"speech"`` or ends
            in ``_<index>`` (e.g. ``speaker_1``).
        sample_rate: Samples per second used to turn seconds into sample
            indices.

    Returns:
        A numpy array of single-character color codes, one per sample: ``'k'``
        where no label applies, ``'r'`` for ``"speech"``, otherwise the
        ``COLORS`` entry for the speaker index.
    """
    # Explicit one-character dtype so an empty signal still yields a string array.
    c=np.array(['k']*len(signal), dtype='<U1')
    for time_stamp in speech_labels:
        start,end,label=time_stamp.split()
        # Honor the sample_rate argument (it used to be ignored in favor of a
        # hard-coded 16000).
        start,end = int(float(start)*sample_rate),int(float(end)*sample_rate)
        if label == "speech":
            # The array holds one character per sample, so 'red' was always
            # stored as 'r'; spell that out instead of relying on truncation.
            code = 'r'
        else:
            # Wrap around the palette instead of raising IndexError when there
            # are more speakers than colors.
            code = COLORS[int(label.split('_')[-1]) % len(COLORS)]
        c[start:end]=code

    return c

In [None]:
# Plot the loaded recording with the default title and no color overlay.
display_waveform(signal)

# Parameters for NeMo Speaker Diarization

In [None]:

# Selects which inference config file (diar_infer_<DOMAIN_TYPE>.yaml) is used.
DOMAIN_TYPE = 'meeting'

CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
# NOTE(review): this URL tracks the `main` branch, while the install notes at
# the top of the notebook pin NeMo to r1.20.0 — consider pinning the config to
# the installed release so the schema matches.
CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"

ROOT = os.getcwd()
data_dir = os.path.join(ROOT,'data')
# Create the working directory up front: the download below and the manifest
# written later both fail if it does not exist yet.
os.makedirs(data_dir, exist_ok=True)

# Reuse a previously downloaded config; fetch it only when it is missing.
CONFIG = os.path.join(data_dir,CONFIG_FILE_NAME)
if not os.path.exists(CONFIG):
    CONFIG = wget.download(CONFIG_URL, data_dir)

cfg = OmegaConf.load(CONFIG)

In [None]:
import json

# Location of the manifest that tells the diarizer which recording to process.
manifest_path = os.path.join(data_dir,'input_manifest.json')

# Single manifest entry describing `audio_path`; unknown fields are left as None.
meta = {
    'audio_filepath': audio_path,
    'offset': 0,
    'duration': None,
    'label': 'infer',
    'text': '-',
    'num_speakers': None,
    'rttm_filepath': None,
    'uem_filepath': None,
}

# The manifest holds one JSON object per line, so terminate the entry with a newline.
with open(manifest_path,'w') as fp:
    fp.write(json.dumps(meta) + '\n')

cfg.diarizer.manifest_filepath = manifest_path

In [None]:
# Top-level data-loading settings.
cfg.num_workers = 4
cfg.batch_size = 32

# Multi-scale speaker embeddings: the three lists are parallel, one entry per scale.
cfg.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5, 1.0, 0.5]
cfg.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75, 0.5, 0.25]
cfg.diarizer.speaker_embeddings.parameters.multiscale_weights = [0.33, 0.33, 0.33]

pretrained_speaker_model='titanet_large'
cfg.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
# The number of speakers is not supplied (the manifest sets num_speakers to None).
cfg.diarizer.clustering.parameters.oracle_num_speakers=False
cfg.diarizer.out_dir = data_dir

cfg.diarizer.ignore_overlap = False
cfg.diarizer.collar = 0.25

# Not using oracle VAD: speech regions come from the VAD model below.
cfg.diarizer.oracle_vad = False
cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'

asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
# Pass the config object built in this notebook (`cfg`); the previous `config`
# name is not defined anywhere and raised NameError.
# NOTE: this rebinds `model`, which earlier in the notebook held the Whisper model.
model = ClusteringDiarizer(cfg=cfg)

In [None]:
# Run the clustering diarizer configured in the previous cell.
# NOTE(review): results are presumably written under cfg.diarizer.out_dir — confirm.
model.diarize()