In [None]:
# Install required packages
!pip install nemo_toolkit[asr]
!pip install pyannote.audio
!pip install librosa soundfile


In [2]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [5]:
import os
import wget
import librosa
import soundfile as sf
import torch
from nemo.collections.asr.models import ClusteringDiarizer

# Set your audio file path
audio_file = "/content/drive/MyDrive/AIML/project/sample3.wav"

# Create output directory
os.makedirs("outputs", exist_ok=True)

# Configure diarization parameters
from omegaconf import DictConfig

diarizer_config = DictConfig({
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'diarizer': {
        'manifest_filepath': 'input_manifest.json',
        'out_dir': 'outputs/',
        'oracle_vad': False,  # Set to True if you have ground truth VAD
        'clustering': {
            'parameters': {
                'oracle_num_speakers': False,  # Set to True if you know exact number of speakers
                'max_num_speakers': 8,
                'enhanced_count_thres': 0.80,
                'max_rp_threshold': 0.25,
                'sparse_search_volume': 10
            }
        },
        'vad': {
            'model_path': 'vad_multilingual_marblenet',
            'parameters': {
                'onset': 0.8,
                'offset': 0.6,
                'pad_onset': 0.05,
                'pad_offset': -0.05,
                'min_duration_on': 0.2,
                'min_duration_off': 0.2
            }
        },
        'speaker_embeddings': {
            'model_path': 'titanet_large',
            'parameters': {
                'window_length_in_sec': 0.025,
                'shift_length_in_sec': 0.01,
                'multiscale_weights': None,
                'save_embeddings': False
            }
        }
    }
})

# Initialize the diarization model with config
diarizer = ClusteringDiarizer(cfg=diarizer_config)

# Create manifest file (required format for NeMo)
import json

def create_manifest(audio_file, manifest_path):
    """Create a manifest file for the audio input"""
    # Get audio duration
    duration = librosa.get_duration(filename=audio_file)

    manifest_entry = {
        "audio_filepath": audio_file,
        "offset": 0,
        "duration": duration,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None
    }

    with open(manifest_path, 'w') as f:
        json.dump(manifest_entry, f)
        f.write('\n')

# Create the manifest
manifest_path = 'input_manifest.json'
create_manifest(audio_file, manifest_path)

# Update config with the manifest path
diarizer_config.diarizer.manifest_filepath = manifest_path

# Run diarization
print("Starting diarization...")
diarizer.diarize()

# Read and display results
rttm_file = os.path.join('outputs', 'pred_rttms', os.path.basename(audio_file).replace('.wav', '.rttm'))

if os.path.exists(rttm_file):
    print("\nDiarization Results:")
    print("=" * 50)
    with open(rttm_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 8:
                start_time = float(parts[3])
                duration = float(parts[4])
                end_time = start_time + duration
                speaker = parts[7]
                print(f"Speaker {speaker}: {start_time:.2f}s - {end_time:.2f}s")

    print(f"\nFull RTTM file saved at: {rttm_file}")
else:
    print("Diarization completed but RTTM file not found. Check the outputs directory.")



print("\nDiarization complete!")

[NeMo I 2025-08-12 16:24:48 nemo_logging:393] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2025-08-12 16:24:48 nemo_logging:393] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo to /root/.cache/torch/NeMo/NeMo_2.4.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2025-08-12 16:24:49 nemo_logging:393] Instantiating model from pre-trained checkpoint


[NeMo W 2025-08-12 16:24:49 nemo_logging:405] Please use the EncDecSpeakerLabelModel instead of this model. EncDecClassificationModel model is kept for backward compatibility with older models.
[NeMo W 2025-08-12 16:24:49 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_

[NeMo I 2025-08-12 16:24:49 nemo_logging:393] PADDING: 16
[NeMo I 2025-08-12 16:24:49 nemo_logging:393] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.4.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.


ConfigAttributeError: Key 'window_length_in_sec' is not in struct
    full_key: diarizer.vad.parameters.window_length_in_sec
    object_type=dict