In [1]:
%%capture --no-stderr

import warnings, logging, os
warnings.filterwarnings("ignore")
for logger_name in ['nemo_logger', 'nemo', 'pytorch_lightning', 'pyannote', 'numba']:
    logging.getLogger(logger_name).setLevel(logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


# === 1. Install Dependencies ===
# (Using -q for quiet installation to keep the cell clean)
!pip install wget text-unidecode -q
!apt-get install -y sox libsndfile1 ffmpeg -q
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@r1.23.0#egg=nemo_toolkit[asr] -q
!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html -q

In [2]:
# === 2. Imports ===
import glob, json, wget, librosa, pprint
import numpy as np
import matplotlib.pyplot as plt
from omegaconf import OmegaConf
from IPython.display import Audio, display

import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
from nemo.collections.asr.parts.utils.speaker_utils import rttm_to_labels

In [3]:
# === 3. Configuration & Setup ===
# User-supplied inputs: one recording and its reference RTTM annotation.
AUDIO_FILENAME = '/kaggle/input/voxconverse-dataset/voxconverse_dev_wav/audio/afjiv.wav'
RTTM_FILEPATH = '/kaggle/input/voxconverse-dataset/labels/dev/afjiv.rttm'

# Local workspace: a "data" folder under the current working directory,
# created on first run and reused afterwards.
ROOT = os.getcwd()
data_dir = os.path.join(ROOT, 'data')
os.makedirs(data_dir, exist_ok=True)

# Fail fast when the audio input is missing; nothing downstream can run without it.
audio_found = os.path.exists(AUDIO_FILENAME)
if not audio_found:
    raise FileNotFoundError(f"CRITICAL: Audio file not found at {AUDIO_FILENAME}")

In [4]:
# === 4. Load Audio ===
print(f"Loading: {os.path.basename(AUDIO_FILENAME)}")
# sr=None keeps the file's native sampling rate instead of resampling.
signal, sample_rate = librosa.load(AUDIO_FILENAME, sr=None)

# === 5. Configure Diarization System ===
# Download the standard meeting inference config.
# The config is fetched from the same NeMo ref that is installed (r1.23.0)
# rather than from `main`, so the YAML schema matches the library code.
NEMO_REF = "r1.23.0"
CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/{NEMO_REF}/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml"
CONFIG_PATH = os.path.join(data_dir, "diar_config.yaml")

# Download once and cache. The file is saved to CONFIG_PATH itself so that the
# existence check above the download actually matches the saved file; saving
# into the directory would store it under the URL's basename and trigger a
# fresh download (and a duplicate file) on every re-run.
if not os.path.exists(CONFIG_PATH):
    wget.download(CONFIG_URL, out=CONFIG_PATH)

cfg = OmegaConf.load(CONFIG_PATH)

# Prepare Manifest with Ground Truth RTTM
# The reference RTTM is attached only when the file is actually present;
# otherwise the entry is written with a null rttm_filepath.
meta = {
    'audio_filepath': AUDIO_FILENAME,
    'offset': 0,
    'duration': None,
    'label': 'infer',
    'text': '-',
    'num_speakers': None,
    'rttm_filepath': RTTM_FILEPATH if os.path.exists(RTTM_FILEPATH) else None,
    'uem_filepath': None
}

# The manifest is a JSON-lines file: one JSON object per line.
manifest_path = os.path.join(data_dir, 'input_manifest.json')
with open(manifest_path, 'w', encoding='utf-8') as f:
    json.dump(meta, f)
    f.write('\n')

Loading: afjiv.wav


In [5]:
# High-Accuracy Model Settings
# All overrides live under the `diarizer` node; the alias refers to that same
# node, so assignments through it update `cfg` in place.
diar_cfg = cfg.diarizer

# Input manifest and output directory
diar_cfg.manifest_filepath = manifest_path
diar_cfg.out_dir = data_dir

# Pretrained models, referenced by name
diar_cfg.vad.model_path = 'vad_multilingual_marblenet'
diar_cfg.speaker_embeddings.model_path = 'titanet_large'
# IMPROVEMENT: Upgrading ASR model for better accuracy
diar_cfg.asr.model_path = 'stt_en_conformer_ctc_large'

# Oracle and ASR-based-VAD options are all switched off
diar_cfg.oracle_vad = False
diar_cfg.asr.parameters.asr_based_vad = False
diar_cfg.clustering.parameters.oracle_num_speakers = False

In [6]:
%%capture
# === Run Inference ===
AUDIO_FILE = '/kaggle/input/voxconverse-dataset/voxconverse_dev_wav/audio/afjiv.wav'
RTTM_FILE = '/kaggle/input/voxconverse-dataset/labels/dev/afjiv.rttm'
asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
asr_model = asr_decoder_ts.set_asr_model()
word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)

asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset
diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)

# === Results ===
trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
file_key = os.path.splitext(os.path.basename(AUDIO_FILE))[0]

[NeMo I 2026-01-28 01:58:07 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2026-01-28 01:58:07 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_conformer_ctc_large/versions/1.10.0/files/stt_en_conformer_ctc_large.nemo to /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2026-01-28 01:58:09 common:924] Instantiating model from pre-trained checkpoint
[NeMo I 2026-01-28 01:58:09 mixins:172] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2026-01-28 01:58:09 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_

[NeMo I 2026-01-28 01:58:09 features:289] PADDING: 0
[NeMo I 2026-01-28 01:58:12 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.


[NeMo W 2026-01-28 01:58:12 decoder_timestamps_utils:71] `ctc_decode` was set to True. Note that this is ignored.


[NeMo I 2026-01-28 01:58:12 features:289] PADDING: 0
[NeMo I 2026-01-28 01:58:12 features:289] PADDING: 0
[NeMo I 2026-01-28 01:58:12 decoder_timestamps_utils:656] Running ASR model stt_en_conformer_ctc_large
[NeMo I 2026-01-28 01:58:12 decoder_timestamps_utils:660] [1/1] FrameBatchASR: /kaggle/input/voxconverse-dataset/voxconverse_dev_wav/audio/afjiv.wav
[NeMo I 2026-01-28 01:58:15 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2026-01-28 01:58:15 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2026-01-28 01:58:15 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo to /root/.cache/torch/NeMo/NeMo_1.23.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2026-01-28 01:58:22 common:924] Instantiating model from pre-trained checkpoint


[NeMo W 2026-01-28 01:58:22 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2026-01-28 01:58:22 features:289] PADDING: 16
[NeMo I 2026-01-28 01:58:22 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2026-01-28 01:58:22 clustering_diarizer:157] Loading pretrained titanet_large model from NGC
[NeMo I 2026-01-28 01:58:22 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_large/versions/v1/files/titanet-l.nemo to /root/.cache/torch/NeMo/NeMo_1.23.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2026-01-28 01:58:23 common:924] Instantiating model from pre-trained checkpoint


[NeMo W 2026-01-28 01:58:23 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2026-01-28 01:58:23 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2026-01-28 01:58:24 features:289] PADDING: 16
[NeMo I 2026-01-28 01:58:24 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2026-01-28 01:58:24 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2026-01-28 01:58:24 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue
[NeMo I 2026-01-28 01:58:24 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2026-01-28 01:58:24 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2026-01-28 01:58:24 collections:446] Dataset loaded with 4 items, total duration of  0.04 hours.
[NeMo I 2026-01-28 01:58:24 collections:448] # 4 files loaded accounting to # 1 labels
[NeMo I 2026-01-28 01:58:25 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.
[NeMo I 2026-01-28 01:58:26

In [7]:
# Word-level transcript: one row per recognized word, followed by a summary.
header = f"{'Index':<6} {'Start':>8} {'End':>8} {'Speaker':<12} {'Word'}"
print(header)
print("-" * 60)

words_info = trans_info_dict[file_key]['words']
for row_no, entry in enumerate(words_info):
    start, end = entry['start_time'], entry['end_time']
    print(f"{row_no:<6} {start:>8.2f} {end:>8.2f} {entry['speaker']:<12} {entry['word']}")

# Distinct speaker labels that occur in the word list.
speakers = {entry['speaker'] for entry in words_info}
print(f"\n--- SUMMARY: {len(words_info)} words, {len(speakers)} speakers ---")

Index     Start      End Speaker      Word
------------------------------------------------------------
0          5.20     5.24 speaker_2    i
1          5.32     5.48 speaker_2    think
2          5.72     5.84 speaker_2    if
3          5.96     6.16 speaker_2    you're
4          6.24     6.28 speaker_2    a
5          6.40     6.64 speaker_2    leader
6          6.68     6.72 speaker_2    and
7          6.80     6.84 speaker_2    you
8          6.88     7.08 speaker_2    don't
9          7.20     7.64 speaker_2    understand
10         7.72     7.76 speaker_2    the
11         7.92     8.12 speaker_2    terms
12         8.20     8.24 speaker_2    that
13         8.32     8.48 speaker_2    you're
14         8.68     8.88 speaker_2    using
15         8.96     9.12 speaker_2    that's
16         9.20     9.48 speaker_2    probably
17         9.56     9.84 speaker_2    theirs
18         9.96    10.16 speaker_2    start
19        10.28    10.44 speaker_2    it's
20        10.48    10.