<a href="https://colab.research.google.com/github/chitimbwasc/chitimbwasc/blob/main/Laion_Pipeline_with_Dask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
from dask.distributed import Client
from dask import bag, delayed, compute, config
from faster_whisper import WhisperModel
import torch
import json
import dask
import whisperx
import pandas as pd
from omegaconf import OmegaConf
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from deepmultilingualpunctuation import PunctuationModel
from pydub import AudioSegment
import time

In [None]:
# Start a Dask distributed client with default arguments; the cell output
# below reports it attached to a distributed.LocalCluster.
client = Client()
# Bare expression so the notebook renders the client/cluster summary.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 3
Total threads: 6,Total memory: 110.05 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:42635,Workers: 3
Dashboard: http://127.0.0.1:8787/status,Total threads: 6
Started: Just now,Total memory: 110.05 GiB

0,1
Comm: tcp://127.0.0.1:36347,Total threads: 2
Dashboard: http://127.0.0.1:38599/status,Memory: 36.68 GiB
Nanny: tcp://127.0.0.1:43473,
Local directory: /tmp/dask-scratch-space/worker-vzjoqflx,Local directory: /tmp/dask-scratch-space/worker-vzjoqflx
GPU: Tesla V100-PCIE-16GB,GPU memory: 15.78 GiB

0,1
Comm: tcp://127.0.0.1:41163,Total threads: 2
Dashboard: http://127.0.0.1:38721/status,Memory: 36.68 GiB
Nanny: tcp://127.0.0.1:32961,
Local directory: /tmp/dask-scratch-space/worker-a83m_nv5,Local directory: /tmp/dask-scratch-space/worker-a83m_nv5
GPU: Tesla V100-PCIE-16GB,GPU memory: 15.78 GiB

0,1
Comm: tcp://127.0.0.1:34883,Total threads: 2
Dashboard: http://127.0.0.1:39957/status,Memory: 36.68 GiB
Nanny: tcp://127.0.0.1:41961,
Local directory: /tmp/dask-scratch-space/worker-q9207mhl,Local directory: /tmp/dask-scratch-space/worker-q9207mhl
GPU: Tesla V100-PCIE-16GB,GPU memory: 15.78 GiB


### Constants

In [None]:
# Constants
# Default directory for converted audio and diarization artifacts. It is
# created as a side effect of running this cell.
OUTPUT_PATH = Path("output")
OUTPUT_PATH.mkdir(exist_ok=True)

# Base NeMo diarization config (YAML) loaded by create_nemo_diarization_config.
NEMO_DIARIZATION_CONFIG = Path("./diar_infer_telephonic.yaml")

# Language codes for which align_transcription runs whisperx alignment; any
# other detected language falls back to Whisper's own word timings.
# NOTE: built from a set literal, so the order of this list is not guaranteed.
WAV2VEC2_LANGUAGES = list(
    {
        "en",
        "fr",
        "de",
        "es",
        "it",
        "nl",
        "ja",
        "zh",
        "uk",
        "pt",
        "ar",
        "ru",
        "pl",
        "hu",
        "fi",
        "fa",
        "el",
        "tr",
    }
)
# Language codes for which process_transcription restores punctuation before
# realigning speaker labels.
PUNCTUATION_MODEL_LANGUAGES = [
    "en",
    "fr",
    "de",
    "es",
    "it",
    "nl",
    "pt",
    "bg",
    "pl",
    "cs",
    "sk",
    "sl",
]
# Characters treated as the end of a sentence by the realignment helpers.
SENTENCE_ENDING_PUNCTUATIONS = ".?!"

### Convert the audio files to WAV

In [None]:
def convert_audio_to_wav(audio_path: Path, output_path: Path = OUTPUT_PATH) -> Path:
    """
    Re-encode an audio file as a single-channel WAV file using pydub.

    Parameters:
        audio_path (Path): Location of the source audio file.
        output_path (Path): Directory that receives the WAV file.

    Returns:
        Path: ``output_path / "<stem of audio_path>.wav"``.
    """
    wav_path = output_path / f"{audio_path.stem}.wav"
    # Downmix to one channel before exporting.
    mono_audio = AudioSegment.from_file(audio_path).set_channels(1)
    mono_audio.export(wav_path, format="wav")
    return wav_path

In [None]:
# Directory whose entries are fed to the WAV conversion step of the pipeline.
audio_dir = Path('./inputs/')

### Get Whisper Transcription

#### Whisper models

| Size   | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|--------|------------|--------------------|--------------------|---------------|----------------|
| tiny   | 39 M       | tiny.en            | tiny               | ~1 GB         | ~32x           |
| base   | 74 M       | base.en            | base               | ~1 GB         | ~16x           |
| small  | 244 M      | small.en           | small              | ~2 GB         | ~6x            |
| medium | 769 M      | medium.en          | medium             | ~5 GB         | ~2x            |
| large  | 1550 M     | N/A                | large              | ~10 GB        | 1x             |


#### Large-v2 model on GPU

| Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
| --- | --- | --- | --- | --- | --- |
| openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
| faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
| faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |

#### Small model on CPU

| Implementation | Precision | Beam size | Time | Max. memory |
| --- | --- | --- | --- | --- |
| openai/whisper | fp32 | 5 | 10m31s | 3101MB |
| whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
| whisper.cpp | fp16 | 5 | 12m39s | 873MB |
| faster-whisper | fp32 | 5 | 2m44s | 1675MB |
| faster-whisper | int8 | 5 | 2m04s | 995MB |

In [None]:
MODEL_NAME = "tiny"
# MODEL_NAME = "large-v2"


# Lazily created model, shared by all calls made in the same process.
whisper_model = None
# (model_name, device) that get_whisper_transcription loaded `whisper_model`
# with; stays None until the function has loaded a model itself.
_whisper_model_key = None


def get_whisper_transcription(
    audio_path: Path, model_name=MODEL_NAME, device="cuda", beam_size=5
):
    """
    Transcribe an audio file with faster-whisper, requesting word timestamps.

    The model is created on first use and cached in the module-level
    ``whisper_model``. If a later call asks for a different ``model_name`` or
    ``device`` than the cached model was loaded with, the model is reloaded
    instead of silently reusing the wrong one. A model assigned to
    ``whisper_model`` from outside this function is reused as-is.

    Parameters:
        audio_path (Path): Audio file to transcribe.
        model_name (str): Whisper model size/name passed to ``WhisperModel``.
        device (str): Device passed to ``WhisperModel``.
        beam_size (int): Beam size passed to ``transcribe``.

    Returns:
        tuple: ``(segments, info)`` where ``segments`` is a list with one dict
        per segment (``segment._asdict()``) and ``info`` is the second value
        returned by ``WhisperModel.transcribe``.
    """
    global whisper_model, _whisper_model_key

    requested_key = (model_name, device)
    cache_is_stale = (
        _whisper_model_key is not None and _whisper_model_key != requested_key
    )
    if whisper_model is None or cache_is_stale:
        whisper_model = WhisperModel(model_name, device=device)
        _whisper_model_key = requested_key

    segments, info = whisper_model.transcribe(
        str(audio_path),
        beam_size=beam_size,
        word_timestamps=True,
    )

    # Materialize the segments as plain dicts before returning them.
    return [segment._asdict() for segment in segments], info

In [None]:
def create_nemo_diarization_config(
    audio_path: Path, nemo_config_path: Path, output_path: Path = OUTPUT_PATH
) -> OmegaConf:
    """
    Build the NeMo diarization configuration for a single audio file.

    Loads the base configuration from ``nemo_config_path``, writes a one-line
    JSON manifest describing ``audio_path`` under ``output_path / "data"``,
    and overrides the manifest, output directory, VAD, speaker-embedding and
    MSDD settings on the loaded configuration.

    Returns:
        OmegaConf: Configuration object for NeMo diarization.
    """
    cfg = OmegaConf.load(nemo_config_path)

    # Write the manifest that tells the diarizer which audio file to process.
    manifest_dir = output_path / "data"
    manifest_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = manifest_dir / f"{audio_path.stem}_manifest.json"
    manifest_entry = {
        "audio_filepath": str(audio_path),
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }
    manifest_path.write_text(json.dumps(manifest_entry) + "\n")

    cfg.num_workers = 1  # Avoid multiprocessing hang with IPython

    diarizer = cfg.diarizer
    diarizer.manifest_filepath = str(manifest_path)
    diarizer.out_dir = str(output_path / audio_path.stem)

    # Pretrained speaker-embedding model
    diarizer.speaker_embeddings.model_path = "titanet_large"
    diarizer.oracle_vad = False  # Use model-based VAD
    diarizer.clustering.parameters.oracle_num_speakers = False

    # Pretrained VAD model and its thresholds
    vad = diarizer.vad
    vad.model_path = "vad_multilingual_marblenet"
    vad.parameters.onset = 0.8
    vad.parameters.offset = 0.6
    vad.parameters.pad_offset = -0.05

    # Telephonic speaker diarization model
    diarizer.msdd_model.model_path = "diar_msdd_telephonic"

    return cfg

def get_nemo_diarization(
    audio_path: Path, nemo_config: OmegaConf, device: str = "cuda"
):
    """
    Run NeMo MSDD speaker diarization for the given configuration.

    Parameters:
        audio_path (Path): Not used by this function; kept so existing
            callers that pass it keep working.
        nemo_config (OmegaConf): Diarization configuration, e.g. the object
            returned by create_nemo_diarization_config.
        device (str): Device the diarizer is moved to. Defaults to "cuda",
            matching the previously hard-coded value.

    Returns:
        bool: True once ``diarize()`` has returned.
    """
    # Initialize NeMo MSDD diarization model on the requested device
    msdd_model = NeuralDiarizer(cfg=nemo_config).to(device)
    msdd_model.diarize()
    return True

In [None]:
# Alignment models already loaded in this process, keyed by language code.
alignment_model_cache = {}


def align_transcription(
    audio_path: Path, whisper_results, info, supported_languages=WAV2VEC2_LANGUAGES, device="cuda"
):
    """
    Produce word-level timestamps for a Whisper transcription.

    For languages in ``supported_languages`` the segments are aligned with
    whisperx and its ``word_segments`` are returned. For any other language
    the word timings already present in ``whisper_results`` are returned.

    Parameters:
        audio_path (Path): Audio file the transcription belongs to.
        whisper_results (list): Segment dicts, each with a "words" entry.
        info: Transcription info object exposing ``language``.
        supported_languages: Language codes for which alignment is attempted.
        device (str): Device used to load and run the alignment model.

    Returns:
        list: One dict per word. Fallback entries carry "word", "start" and
        "end" (plus "text", see below).
    """
    language = info.language

    if language not in supported_languages:
        # Each word is indexed positionally: [0]=start, [1]=end, [2]=text
        # (assumes faster-whisper's word tuples - TODO confirm).
        # The word is exposed under "word" because map_words_to_speaker reads
        # that key; "text" is kept for callers that relied on the old key.
        return [
            {"word": word[2], "text": word[2], "start": word[0], "end": word[1]}
            for segment in whisper_results
            for word in segment["words"]
        ]

    # Lazy loading of the alignment model, once per language.
    # NOTE: the cache is keyed by language only, so a later call with another
    # `device` reuses the model loaded for the first one.
    if language not in alignment_model_cache:
        alignment_model, metadata = whisperx.load_align_model(
            language_code=language, device=device
        )
        alignment_model_cache[language] = (alignment_model, metadata)
    else:
        alignment_model, metadata = alignment_model_cache[language]

    result_aligned = whisperx.align(
        whisper_results, alignment_model, metadata, str(audio_path), device
    )

    return result_aligned["word_segments"]

In [None]:
def get_word_timestamp_anchor(start_time, end_time, option="start"):
    """
    Pick the point in time that represents a word.

    Parameters:
        start_time (int): The start time of the word in milliseconds.
        end_time (int): The end time of the word in milliseconds.
        option (str): "start", "mid" or "end"; any other value behaves
            like "start".

    Returns:
        int: ``end_time`` for "end", the floor-divided midpoint for "mid",
        otherwise ``start_time``.
    """
    if option == "mid":
        return (start_time + end_time) // 2
    return end_time if option == "end" else start_time


@delayed
def map_words_to_speaker(
    word_timestamps, speaker_timestamps, word_anchor_option="start"
):
    """
    Assign a speaker to every word by walking the speaker turns in order.

    Parameters:
        word_timestamps (list): Dicts with "word", "start" and "end"; the
            times are multiplied by 1000 and truncated to integers here.
        speaker_timestamps (list): ``(start, end, speaker)`` tuples in the
            same unit as the scaled word times.
        word_anchor_option (str): Which point of the word is compared with
            the turn boundaries ("start", "mid", or "end").

    Returns:
        list: Dicts with "word", "start_time", "end_time" and "speaker".
    """
    _, turn_end, turn_speaker = speaker_timestamps[0]
    last_turn = len(speaker_timestamps) - 1
    current_turn = 0
    assigned = []

    for entry in word_timestamps:
        start_ms = int(entry["start"] * 1000)
        end_ms = int(entry["end"] * 1000)
        text = entry["word"]

        anchor = get_word_timestamp_anchor(start_ms, end_ms, word_anchor_option)

        # Advance through the turns until one ends at or after the anchor.
        while anchor > float(turn_end):
            current_turn = min(current_turn + 1, last_turn)
            _, turn_end, turn_speaker = speaker_timestamps[current_turn]

            # On the final turn, stretch its end to this word's end so the
            # loop terminates and trailing words stay with the last speaker.
            if current_turn == last_turn:
                turn_end = get_word_timestamp_anchor(
                    start_ms, end_ms, option="end"
                )

        assigned.append(
            {
                "word": text,
                "start_time": start_ms,
                "end_time": end_ms,
                "speaker": turn_speaker,
            }
        )

    return assigned


@delayed
def read_speaker_timestamps(rttm_file: Path) -> list:
    """
    Read the speaker timestamps from an RTTM file.

    Lines are split on any run of whitespace, so the parser does not depend
    on the exact number of spaces between fields. Blank lines are skipped.

    Parameters:
        rttm_file (Path): The path to the RTTM file.

    Returns:
        list: Tuples of ``(start_ms, end_ms, speaker)``, where ``speaker`` is
        the integer after the last underscore of the speaker name field.
    """
    speaker_timestamps = []
    with rttm_file.open("r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines

            # RTTM field order: type, file id, channel, onset, duration,
            # orthography, speaker type, speaker name, ...
            start_time = int(float(fields[3]) * 1000)
            end_time = start_time + int(float(fields[4]) * 1000)
            speaker = int(fields[7].split("_")[-1])
            speaker_timestamps.append((start_time, end_time, speaker))
    return speaker_timestamps


def is_word_sentence_end(x, word_list, sentence_endings):
    """
    Tell whether ``word_list[x]`` ends with a sentence-ending character.

    Returns False for a negative index and for an empty word instead of
    indexing into them.
    """
    if x < 0:
        return False
    word = word_list[x]
    return bool(word) and word[-1] in sentence_endings


def get_first_word_idx_of_sentence(
    word_idx, word_list, speaker_list, max_words, sentence_endings
):
    """
    Scan left from ``word_idx`` for the first word of its sentence.

    The scan stops at index 0, after ``max_words`` steps, at a speaker
    change, or right after a word that ends a sentence.

    Returns:
        int: Index of the first word of the sentence, or -1 when the scan
        stopped somewhere that is not a sentence start.
    """
    left_idx = word_idx
    while (
        left_idx > 0
        # Distance travelled so far; must be measured from word_idx.
        and word_idx - left_idx < max_words
        and speaker_list[left_idx - 1] == speaker_list[left_idx]
        and not is_word_sentence_end(left_idx - 1, word_list, sentence_endings)
    ):
        left_idx -= 1

    return (
        left_idx
        if left_idx == 0
        or is_word_sentence_end(left_idx - 1, word_list, sentence_endings)
        else -1
    )


def get_last_word_idx_of_sentence(word_idx, word_list, max_words, sentence_endings):
    """
    Scan right from ``word_idx`` for the last word of its sentence.

    The scan stops at the final word, after ``max_words`` steps, or on a
    word that ends a sentence.

    Returns:
        int: Index of the last word of the sentence, or -1 when the scan
        stopped on a word that neither ends a sentence nor is the final word.
    """
    right_idx = word_idx
    while (
        # Stop on the final word so right_idx never leaves the list.
        right_idx < len(word_list) - 1
        and right_idx - word_idx < max_words
        and not is_word_sentence_end(right_idx, word_list, sentence_endings)
    ):
        right_idx += 1

    return (
        right_idx
        if right_idx == len(word_list) - 1
        or is_word_sentence_end(right_idx, word_list, sentence_endings)
        else -1
    )


def get_realigned_ws_mapping_with_punctuation(
    word_speaker_mapping,
    max_words_in_sentence=50,
    sentence_endings=SENTENCE_ENDING_PUNCTUATIONS,
):
    """
    Smooth speaker labels so a speaker change does not split a sentence.

    Wherever the speaker changes on a word that does not end a sentence, the
    surrounding sentence is located and every word in it is relabelled with
    the sentence's most frequent speaker. Sentences whose boundaries cannot
    be found, or whose most frequent speaker covers fewer than half of the
    words (integer division), are left untouched.

    Returns:
        list: Copies of the input dicts with "speaker" replaced by the
        realigned label. The input dicts are not modified.
    """
    total = len(word_speaker_mapping)

    pairs = [(entry["word"], entry["speaker"]) for entry in word_speaker_mapping]
    words = [word for word, _ in pairs]
    speakers = [speaker for _, speaker in pairs]

    idx = 0
    while idx < total:
        speaker_changes_mid_sentence = (
            idx < total - 1
            and speakers[idx] != speakers[idx + 1]
            and not is_word_sentence_end(idx, words, sentence_endings)
        )
        if not speaker_changes_mid_sentence:
            idx += 1
            continue

        first = get_first_word_idx_of_sentence(
            idx, words, speakers, max_words_in_sentence, sentence_endings
        )
        if first == -1:
            idx += 1
            continue

        # The right-hand scan only gets the word budget left over after the
        # left-hand part of the sentence.
        last = get_last_word_idx_of_sentence(
            idx,
            words,
            max_words_in_sentence - idx + first - 1,
            sentence_endings,
        )
        if last == -1:
            idx += 1
            continue

        window = speakers[first : last + 1]
        majority = max(set(window), key=window.count)
        if window.count(majority) < len(window) // 2:
            idx += 1
            continue

        speakers[first : last + 1] = [majority] * (last - first + 1)
        # Resume after the sentence that was just relabelled.
        idx = last + 1

    realigned = []
    for entry, speaker in zip(word_speaker_mapping, speakers):
        updated = entry.copy()
        updated["speaker"] = speaker
        realigned.append(updated)

    return realigned


def get_sentences_speaker_mapping(
    word_speaker_mapping, spk_ts, audio_path
):
    """
    Group consecutive words spoken by the same speaker into utterances.

    The first utterance is seeded from the first speaker turn in ``spk_ts``
    (its start, end and speaker). A new utterance is started every time the
    speaker of a word differs from the speaker of the previous word.

    Parameters:
        word_speaker_mapping (list): Dicts with "word", "speaker",
            "start_time" and "end_time".
        spk_ts (list): ``(start, end, speaker)`` tuples; only the first one
            is read.
        audio_path: Object whose ``name`` is stored with every utterance.

    Returns:
        list: Dicts with "speaker", "start_time", "end_time", "text" and
        "audio"; each word in "text" is followed by a single space.
    """
    first_start, first_end, first_speaker = spk_ts[0]

    current = {
        "speaker": f"Speaker {first_speaker}",
        "start_time": first_start,
        "end_time": first_end,
        "text": "",
        "audio": audio_path.name,
    }
    sentences = []
    active_speaker = first_speaker

    for entry in word_speaker_mapping:
        word, speaker = entry["word"], entry["speaker"]
        start, end = entry["start_time"], entry["end_time"]

        if speaker == active_speaker:
            # Same speaker keeps talking: extend the running utterance.
            current["end_time"] = end
        else:
            # Speaker changed: close the running utterance, open a new one.
            sentences.append(current)
            current = {
                "audio": audio_path.name,
                "speaker": f"Speaker {speaker}",
                "start_time": start,
                "end_time": end,
                "text": "",
            }
            active_speaker = speaker

        current["text"] += f"{word} "

    sentences.append(current)
    return sentences

@delayed
def process_transcription(
    info,
    word_speaker_mapping,
    speaker_timestamps,
    audio_path,
    punctuation_model_languages=PUNCTUATION_MODEL_LANGUAGES,
):
    """
    Turn a word-to-speaker mapping into per-speaker utterances.

    When the detected language is in ``punctuation_model_languages``,
    sentence-ending punctuation is restored on the words and the speaker
    labels are realigned to sentence boundaries first; otherwise the mapping
    is used unchanged.

    Parameters:
        info: Transcription info object exposing ``language``.
        word_speaker_mapping (list): Dicts with "word", "speaker",
            "start_time" and "end_time". The "word" values are updated in
            place when punctuation is restored.
        speaker_timestamps (list): ``(start, end, speaker)`` tuples.
        audio_path: Path of the audio the words belong to.
        punctuation_model_languages: Language codes for which punctuation
            restoration is attempted.

    Returns:
        list: The utterance dicts built by get_sentences_speaker_mapping.
    """
    # Imported here because the notebook's import cell does not import `re`,
    # which the acronym check below needs.
    import re

    if info.language in punctuation_model_languages:
        # Restoring punctuation in the transcript to help realign the sentences
        punct_model = PunctuationModel(model="kredor/punctuate-all")
        words_list = [x["word"] for x in word_speaker_mapping]
        labeled_words = punct_model.predict(words_list)

        ending_puncts = SENTENCE_ENDING_PUNCTUATIONS
        model_puncts = ".,;:!?"
        acronym_pattern = re.compile(r"\b(?:[a-zA-Z]\.){2,}")

        # Acronyms such as U.S.A. already end in a period but are still
        # eligible for a predicted sentence ending.
        def is_acronym(x):
            return acronym_pattern.fullmatch(x)

        for word_dict, labeled_tuple in zip(word_speaker_mapping, labeled_words):
            word = word_dict["word"]
            if (
                word
                and labeled_tuple[1] in ending_puncts
                and (word[-1] not in model_puncts or is_acronym(word))
            ):
                word += labeled_tuple[1]
                # Appending "." to a word that already ended in "." leaves a
                # double period; strip the trailing periods in that case.
                if word.endswith(".."):
                    word = word.rstrip(".")
                word_dict["word"] = word

        wsm = get_realigned_ws_mapping_with_punctuation(word_speaker_mapping)
    else:
        print(f"Punctuation restoration is not available for {info.language} language.")
        wsm = word_speaker_mapping

    return get_sentences_speaker_mapping(
        wsm, speaker_timestamps, audio_path=audio_path
    )

In [None]:
from concurrent.futures import ThreadPoolExecutor

start_time = time.time()

# 1. Convert audios to WAV format
# Only regular files are converted; directories inside the input folder
# (for example Jupyter's .ipynb_checkpoints) cannot be decoded as audio.
sequence_bag = bag.from_sequence(
    [path for path in audio_dir.iterdir() if path.is_file()]
)
converted_audio_paths = sequence_bag.map(convert_audio_to_wav).compute()

# Create a new bag from the converted audio paths for further processing
converted_audio_bag = bag.from_sequence(converted_audio_paths)

# 2. Parallel execution of Whisper transcription, align_bag, and creation of nemo_configs
whisper_transcriptions_bag = converted_audio_bag.map(get_whisper_transcription)

# Unpack the results from `get_whisper_transcription`
whisper_results_bag = whisper_transcriptions_bag.map(lambda x: x[0])
info_bag = whisper_transcriptions_bag.map(lambda x: x[1])

# Align Transcriptions
align_bag = bag.map(
    align_transcription, converted_audio_bag, whisper_results_bag, info_bag
)

# Nemo Configs
nemo_configs_bag = converted_audio_bag.map(
    create_nemo_diarization_config, nemo_config_path=NEMO_DIARIZATION_CONFIG
)

# Compute all results in parallel
whisper_transcriptions, aligned_results, nemo_configs = compute(
    whisper_transcriptions_bag, align_bag, nemo_configs_bag
)

# Take the transcription infos from the results computed above. Iterating
# `info_bag` directly would trigger a second run of the Whisper transcription.
transcription_infos = [info for _, info in whisper_transcriptions]

# 3. NeMo diarization
# Todo: Optimize Performance or add batch processing


def diarize_wrapper(args):
    """Unpack an (audio_path, nemo_config) pair and run diarization on it."""
    audio_path, nemo_config = args
    return get_nemo_diarization(audio_path, nemo_config)


# Set a suitable number for max_workers based on your machine's capabilities
# A good starting point might be the number of CPU cores available.
# NOTE: these threads run in the notebook process, not on the Dask workers.
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(
        executor.map(diarize_wrapper, zip(converted_audio_paths, nemo_configs))
    )
# Todo: end

# 4. Generate RTTM file paths
rttm_file_paths = [
    OUTPUT_PATH / audio_path.stem / "pred_rttms" / f"{audio_path.stem}.rttm"
    for audio_path in converted_audio_paths
]

# 5. Parallel execution of the remaining tasks
speaker_timestamps_tasks = [read_speaker_timestamps(path) for path in rttm_file_paths]
speaker_timestamps = compute(*speaker_timestamps_tasks)

map_words_tasks = [
    map_words_to_speaker(aligned, speaker)
    for aligned, speaker in zip(aligned_results, speaker_timestamps)
]
mapped_words = compute(*map_words_tasks)

processed_audio_data_tasks = [
    process_transcription(info, map_word, speaker, audio)
    for info, map_word, speaker, audio in zip(
        transcription_infos, mapped_words, speaker_timestamps, converted_audio_paths
    )
]

processed_data = compute(*processed_audio_data_tasks)

# Create a Dataframe
df = pd.concat([pd.DataFrame(item) for item in processed_data], ignore_index=True)

end_time = time.time()

[NeMo I 2023-09-20 15:02:29 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2023-09-20 15:02:29 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2023-09-20 15:02:29 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2023-09-20 15:02:29 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2023-09-20 15:02:29 cloud:58] Found existing object /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2023-09-20 15:02:29 cloud:64] Re-using file from: /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2023-09-20 15:02:29 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-09-20 15:02:29 cloud:58] Found existing object /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/diar_msdd_telephonic/3c3697a0a4

[NeMo W 2023-09-20 15:02:33 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2023-09-20 15:02:33 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2023-09-20 15:02:33 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validatio

[NeMo I 2023-09-20 15:02:33 features:289] PADDING: 16


[NeMo W 2023-09-20 15:02:33 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).
    Test config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    seq_eval_mode: false
    


[NeMo I 2023-09-20 15:02:33 features:289] PADDING: 16


[NeMo W 2023-09-20 15:02:33 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2023-09-20 15:02:33 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).
    Test config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    seq_eval_mode: false
    


[NeMo I 2023-09-20 15:02:33 features:289] PADDING: 16


[NeMo W 2023-09-20 15:02:33 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2023-09-20 15:02:33 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2023-09-20 15:02:33 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2023-09-20 15:02:33 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:34 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:34 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:34 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:34 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:37 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2023-09-20 15:02:37 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2023-09-20 15:02:37 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:37 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:37 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /home/azureuser/.cache/tor

[NeMo W 2023-09-20 15:02:39 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2023-09-20 15:02:39 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:39 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2023-09-20 15:02:39 cloud:58] Found existing object /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-20 15:02:39 cloud:64] Re-using file from: /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2023-09-20 15:02:39 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-09-20 15:02:39 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2023-09-20 15:02:39 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:39 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2023-09-20 15:02:39 cloud:58] Found existing object /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-20 15:02:39 cloud:64] Re-using file from: /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2023-09-20 15:02:39 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-09-20 15:02:39 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2023-09-20 15:02:39 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:39 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2023-09-20 15:02:39 cloud:58] Found existing object /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-20 15:02:39 cloud:64] Re-using file from: /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2023-09-20 15:02:39 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-09-20 15:02:39 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.


[NeMo W 2023-09-20 15:02:40 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2023-09-20 15:02:40 features:289] PADDING: 16
[NeMo I 2023-09-20 15:02:40 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-20 15:02:40 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-20 15:02:40 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /home/azureuser/.cache/torch/NeMo/NeMo_1.20.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-09-20 15:02:40 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2023-09-20 15:02:40 msdd_models:865] Clustering Parameters: {
        "oracle_num_s

splitting manifest:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:40 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2023-09-20 15:02:40 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false
    }
[NeMo I 2023-09-20 15:02:40 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:40 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest:   0%|          | 0/1 [00:00<?, ?it/s][A

[NeMo I 2023-09-20 15:02:40 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:40 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue




splitting manifest:   0%|          | 0/1 [00:00<?, ?it/s][A[A

[NeMo I 2023-09-20 15:02:40 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:40 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue





splitting manifest:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A
splitting manifest: 100%|██████████| 1/1 [00:00<00:00,  9.56it/s][A

[NeMo I 2023-09-20 15:02:40 classification_models:272] Perform streaming frame-level VAD





[NeMo I 2023-09-20 15:02:40 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:40 collections:302] Dataset loaded with 1 items, total duration of  0.01 hours.
[NeMo I 2023-09-20 15:02:40 collections:304] # 1 files loaded accounting to # 1 labels



splitting manifest: 100%|██████████| 1/1 [00:00<00:00,  2.19it/s]



splitting manifest: 100%|██████████| 1/1 [00:00<00:00,  2.97it/s][A[A[A


splitting manifest: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s][A[A

[NeMo I 2023-09-20 15:02:41 classification_models:272] Perform streaming frame-level VAD





[NeMo I 2023-09-20 15:02:41 classification_models:272] Perform streaming frame-level VAD
[NeMo I 2023-09-20 15:02:41 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:41 classification_models:272] Perform streaming frame-level VAD
[NeMo I 2023-09-20 15:02:41 collections:302] Dataset loaded with 3 items, total duration of  0.04 hours.
[NeMo I 2023-09-20 15:02:41 collections:304] # 3 files loaded accounting to # 1 labels
[NeMo I 2023-09-20 15:02:41 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:41 collections:302] Dataset loaded with 2 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:41 collections:304] # 2 files loaded accounting to # 1 labels
[NeMo I 2023-09-20 15:02:41 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:41 collections:302] Dataset loaded with 2 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:41

vad:   0%|          | 0/3 [00:00<?, ?it/s]

vad:   0%|          | 0/2 [00:00<?, ?it/s][A[A


vad:   0%|          | 0/2 [00:00<?, ?it/s][A[A[A
vad: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s][A

[NeMo I 2023-09-20 15:02:41 clustering_diarizer:250] Generating predictions with overlapping input segments




vad:  67%|██████▋   | 2/3 [00:01<00:00,  1.76it/s]it/s][A

vad:  50%|█████     | 1/2 [00:01<00:01,  1.14s/it][A[A


vad:  50%|█████     | 1/2 [00:01<00:01,  1.01s/it][A[A[A
generating preds: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s][A
                                                               [A

[NeMo I 2023-09-20 15:02:42 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A

vad: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s][A[A


vad: 100%|██████████| 2/2 [00:01<00:00,  1.47it/s][A[A[A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  2.60it/s][A

[NeMo I 2023-09-20 15:02:42 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, output/speech-with-music/speaker_outputs/subsegments_scale0.json



vad: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]
vad: 100%|██████████| 3/3 [00:01<00:00,  1.77it/s]

[NeMo I 2023-09-20 15:02:42 clustering_diarizer:250] Generating predictions with overlapping input segments



vad: 100%|██████████| 2/2 [00:01<00:00,  1.18it/s]

[NeMo I 2023-09-20 15:02:43 clustering_diarizer:250] Generating predictions with overlapping input segments
[NeMo I 2023-09-20 15:02:43 clustering_diarizer:250] Generating predictions with overlapping input segments



generating preds:   0%|          | 0/1 [00:00<?, ?it/s]
generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A

[NeMo I 2023-09-20 15:02:43 clustering_diarizer:343] Extracting embeddings for Diarization




generating preds:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:43 collections:301] Filtered duration for loading collection is  0.00 hours.


[A[A

[NeMo I 2023-09-20 15:02:43 collections:302] Dataset loaded with 7 items, total duration of  0.00 hours.
[NeMo I 2023-09-20 15:02:43 collections:304] # 7 files loaded accounting to # 1 labels





[1/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A


[1/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s][A[A[A


[NeMo I 2023-09-20 15:02:43 clustering_diarizer:389] Saved embedding files to output/speech-with-music/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:43 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, output/speech-with-music/speaker_outputs/subsegments_scale1.json
[NeMo I 2023-09-20 15:02:43 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:43 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:43 collections:302] Dataset loaded with 8 items, total duration of  0.00 hours.
[NeMo I 2023-09-20 15:02:43 collections:304] # 8 files loaded accounting to # 1 labels





[2/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A


[2/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  4.26it/s][A[A[A

[NeMo I 2023-09-20 15:02:43 clustering_diarizer:389] Saved embedding files to output/speech-with-music/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:43 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, output/speech-with-music/speaker_outputs/subsegments_scale2.json





[NeMo I 2023-09-20 15:02:43 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:43 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:43 collections:302] Dataset loaded with 10 items, total duration of  0.00 hours.
[NeMo I 2023-09-20 15:02:43 collections:304] # 10 files loaded accounting to # 1 labels





[3/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A


[3/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.14it/s][A[A[A
generating preds: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it][A
                                                               [A

[NeMo I 2023-09-20 15:02:44 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.


[3/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.97it/s]

creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A

[NeMo I 2023-09-20 15:02:44 clustering_diarizer:389] Saved embedding files to output/speech-with-music/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:44 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, output/speech-with-music/speaker_outputs/subsegments_scale3.json
[NeMo I 2023-09-20 15:02:44 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:44 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:44 collections:302] Dataset loaded with 12 items, total duration of  0.00 hours.
[NeMo I 2023-09-20 15:02:44 collections:304] # 12 files loaded accounting to # 1 labels





[4/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  6.89it/s][A


generating preds: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it][A[A

                                                               [A[A

[NeMo I 2023-09-20 15:02:44 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A

[NeMo I 2023-09-20 15:02:44 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, output/job-interview/speaker_outputs/subsegments_scale0.json





[4/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  5.54it/s][A[A[A

[NeMo I 2023-09-20 15:02:44 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:44 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:44 collections:302] Dataset loaded with 57 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:44 collections:304] # 57 files loaded accounting to # 1 labels


[4/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]


[1/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A[A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  4.90it/s][A


[NeMo I 2023-09-20 15:02:44 clustering_diarizer:389] Saved embedding files to output/speech-with-music/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:44 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, output/speech-with-music/speaker_outputs/subsegments_scale4.json
[NeMo I 2023-09-20 15:02:44 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:44 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:44 collections:302] Dataset loaded with 17 items, total duration of  0.00 hours.
[NeMo I 2023-09-20 15:02:44 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, output/speech/speaker_outputs/subsegments_scale0.json
[NeMo I 2023-09-20 15:02:44 collections:304] # 17 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A

[NeMo I 2023-09-20 15:02:44 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:44 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:44 collections:302] Dataset loaded with 48 items, total duration of  0.01 hours.
[NeMo I 2023-09-20 15:02:44 collections:304] # 48 files loaded accounting to # 1 labels





[1/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A
[5/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.77it/s][A

[5/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s][A[A
[1/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]

[NeMo I 2023-09-20 15:02:45 clustering_diarizer:389] Saved embedding files to output/speech-with-music/speaker_outputs/embeddings




clustering:   0%|          | 0/1 [00:00<?, ?it/s][A


[1/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s][A[A[A

[NeMo I 2023-09-20 15:02:45 clustering_diarizer:389] Saved embedding files to output/job-interview/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:45 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, output/job-interview/speaker_outputs/subsegments_scale1.json
[NeMo I 2023-09-20 15:02:45 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:45 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:45 collections:302] Dataset loaded with 64 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:45 collections:304] # 64 files loaded accounting to # 1 labels




[1/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
                                                               

[NeMo I 2023-09-20 15:02:45 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:45 clustering_diarizer:389] Saved embedding files to output/speech/speaker_outputs/embeddings



clustering: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s][A

[NeMo I 2023-09-20 15:02:45 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, output/speech/speaker_outputs/subsegments_scale1.json


clustering: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]


[2/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s][A[A

[NeMo I 2023-09-20 15:02:45 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:45 clustering_diarizer:464] Outputs are saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/diffusion/code/Users/yaman/optimization-laion/output/speech-with-music directory


[NeMo W 2023-09-20 15:02:45 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:45 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:45 collections:302] Dataset loaded with 54 items, total duration of  0.01 hours.
[NeMo I 2023-09-20 15:02:45 collections:304] # 54 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.45it/s]

[NeMo I 2023-09-20 15:02:45 msdd_models:960] Loading embedding pickle file of scale:0 at output/speech-with-music/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl



creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  4.51it/s]


[NeMo I 2023-09-20 15:02:45 msdd_models:960] Loading embedding pickle file of scale:1 at output/speech-with-music/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2023-09-20 15:02:45 clustering_diarizer:389] Saved embedding files to output/job-interview/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:45 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, output/job-interview/speaker_outputs/subsegments_scale2.json
[NeMo I 2023-09-20 15:02:45 msdd_models:960] Loading embedding pickle file of scale:2 at output/speech-with-music/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2023-09-20 15:02:45 msdd_models:960] Loading embedding pickle file of scale:3 at output/speech-with-music/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2023-09-20 15:02:46 msdd_models:960] Loading embedding pickle file of scale:4 at output/speech-with-music/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 20

[3/5] extract embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:46 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:46 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:46 collections:302] Dataset loaded with 68 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:46 collections:304] # 68 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/2 [00:00<?, ?it/s][A[A

[NeMo I 2023-09-20 15:02:46 collections:617] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-09-20 15:02:46 collections:620] Total 1 session files loaded accounting to # 1 audio clips





  0%|          | 0/1 [00:00<?, ?it/s][A[A[A
100%|██████████| 1/1 [00:00<00:00, 21.83it/s]1 [00:00<00:00,  1.98it/s][A

[NeMo I 2023-09-20 15:02:46 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2023-09-20 15:02:46 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:46 speaker_utils:93] Number of files to diarize: 1



[3/5] extract embeddings:  50%|█████     | 1/2 [00:00<00:00,  3.12it/s][NeMo W 2023-09-20 15:02:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:46 speaker_utils:93] Number of files to diarize: 1


[2/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
[NeMo W 2023-09-20 15:02:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:46 speaker_utils:93] Number of files to diarize: 1


[3/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  4.71it/s]

[NeMo I 2023-09-20 15:02:46 clustering_diarizer:389] Saved embedding files to output/speech/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:46 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, output/speech/speaker_outputs/subsegments_scale2.json



[NeMo W 2023-09-20 15:02:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:46 msdd_models:1431]   
    
[NeMo I 2023-09-20 15:02:46 clustering_diarizer:389] Saved embedding files to output/job-interview/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:46 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, output/job-interview/speaker_outputs/subsegments_scale3.json
[NeMo I 2023-09-20 15:02:46 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:46 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:46 collections:302] Dataset loaded with 61 items, total duration of  0.01 hours.
[NeMo I 2023-09-20 15:02:46 collections:304] # 61 files loaded accounting to # 1 labels


[3/5] extract embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:46 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:46 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:46 collections:302] Dataset loaded with 98 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:46 collections:304] # 98 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.65it/s]

[3/5] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s][A[A
[1/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  2.57it/s]

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:389] Saved embedding files to output/speech/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, output/speech/speaker_outputs/subsegments_scale3.json




[4/5] extract embeddings:  50%|█████     | 1/2 [00:00<00:00,  2.98it/s][A

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:47 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:47 collections:302] Dataset loaded with 82 items, total duration of  0.01 hours.
[NeMo I 2023-09-20 15:02:47 collections:304] # 82 files loaded accounting to # 1 labels


[4/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  4.13it/s]

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:389] Saved embedding files to output/devil-wears-prada/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, output/devil-wears-prada/speaker_outputs/subsegments_scale1.json





[NeMo I 2023-09-20 15:02:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:47 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:47 collections:302] Dataset loaded with 82 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:47 collections:304] # 82 files loaded accounting to # 1 labels



[2/5] extract embeddings:   0%|          | 0/2 [00:00<?, ?it/s][A

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:389] Saved embedding files to output/job-interview/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, output/job-interview/speaker_outputs/subsegments_scale4.json


[4/5] extract embeddings:  50%|█████     | 1/2 [00:00<00:00,  3.21it/s]

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:47 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:47 collections:302] Dataset loaded with 142 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:47 collections:304] # 142 files loaded accounting to # 1 labels




[4/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  4.72it/s]
[4/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  3.69it/s][A

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:389] Saved embedding files to output/speech/speaker_outputs/embeddings



[2/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  4.56it/s]

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, output/speech/speaker_outputs/subsegments_scale4.json





[5/5] extract embeddings:  33%|███▎      | 1/3 [00:00<00:00,  3.27it/s][A[A

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:47 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:47 collections:302] Dataset loaded with 120 items, total duration of  0.01 hours.
[NeMo I 2023-09-20 15:02:47 collections:304] # 120 files loaded accounting to # 1 labels
[NeMo I 2023-09-20 15:02:47 clustering_diarizer:389] Saved embedding files to output/devil-wears-prada/speaker_outputs/embeddings


[5/5] extract embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, output/devil-wears-prada/speaker_outputs/subsegments_scale2.json
[NeMo I 2023-09-20 15:02:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:47 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:47 collections:302] Dataset loaded with 95 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:47 collections:304] # 95 files loaded accounting to # 1 labels



[3/5] extract embeddings:   0%|          | 0/2 [00:00<?, ?it/s][A

[5/5] extract embeddings: 100%|██████████| 3/3 [00:00<00:00,  4.39it/s][A[A
[5/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  5.29it/s]

[NeMo I 2023-09-20 15:02:48 clustering_diarizer:389] Saved embedding files to output/job-interview/speaker_outputs/embeddings



clustering:   0%|          | 0/1 [00:00<?, ?it/s]
[3/5] extract embeddings:  50%|█████     | 1/2 [00:00<00:00,  2.70it/s][A

[NeMo I 2023-09-20 15:02:48 clustering_diarizer:389] Saved embedding files to output/speech/speaker_outputs/embeddings




[3/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  3.66it/s]


[NeMo I 2023-09-20 15:02:48 clustering_diarizer:389] Saved embedding files to output/devil-wears-prada/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:48 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, output/devil-wears-prada/speaker_outputs/subsegments_scale3.json
[NeMo I 2023-09-20 15:02:48 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:48 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:48 collections:302] Dataset loaded with 124 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:48 collections:304] # 124 files loaded accounting to # 1 labels


clustering: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
clustering: 100%|██████████| 1/1 [00:00<00:00,  2.38it/s]?it/s][A


clustering: 100%|██████████| 1/1 [00:00<00:00,  2.46it/s][A[A

[NeMo I 2023-09-20 15:02:48 clustering_diarizer:464] Outputs are saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/diffusion/code/Users/yaman/optimization-laion/output/job-interview directory



[NeMo W 2023-09-20 15:02:48 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:0 at output/job-interview/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:1 at output/job-interview/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 clustering_diarizer:464] Outputs are saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/diffusion/code/Users/yaman/optimization-laion/output/speech directory


[NeMo W 2023-09-20 15:02:48 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:0 at output/speech/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:2 at output/job-interview/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:1 at output/speech/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:3 at output/job-interview/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:2 at output/speech/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2023-09-20 15:02:48 msdd_models:960] Loading embedding pickle file of scale:4 at output/job-interview/speaker_outputs/embeddings/subsegments_scale4_embeddings


[4/5] extract embeddings:  50%|█████     | 1/2 [00:00<00:00,  3.11it/s][A

[NeMo I 2023-09-20 15:02:49 collections:617] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-09-20 15:02:49 collections:620] Total 1 session files loaded accounting to # 1 audio clips


  0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-09-20 15:02:49 collections:617] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-09-20 15:02:49 collections:620] Total 1 session files loaded accounting to # 1 audio clips




100%|██████████| 1/1 [00:00<00:00, 48.31it/s]

[NeMo I 2023-09-20 15:02:49 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1



100%|██████████| 1/1 [00:00<00:00, 25.78it/s]

[NeMo I 2023-09-20 15:02:49 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1



[4/5] extract embeddings: 100%|██████████| 2/2 [00:00<00:00,  4.33it/s]
[NeMo W 2023-09-20 15:02:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2023-09-20 15:02:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:49 clustering_diarizer:389] Saved embedding files to output/devil-wears-prada/speaker_outputs/embeddings
[NeMo I 2023-09-20 15:02:49 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, output/devil-wears-prada/speaker_outputs/subsegments_scale4.json


[NeMo W 2023-09-20 15:02:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2023-09-20 15:02:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:49 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-09-20 15:02:49 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-09-20 15:02:49 collections:302] Dataset loaded with 190 items, total duration of  0.02 hours.
[NeMo I 2023-09-20 15:02:49 collections:304] # 190 files loaded accounting to # 1 labels


[5/5] extract embeddings:   0%|          | 0/3 [00:00<?, ?it/s][NeMo W 2023-09-20 15:02:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:49 msdd_models:1431]   
    


[NeMo W 2023-09-20 15:02:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:49 msdd_models:1431]   
    


[5/5] extract embeddings: 100%|██████████| 3/3 [00:00<00:00,  6.80it/s]

[NeMo I 2023-09-20 15:02:49 clustering_diarizer:389] Saved embedding files to output/devil-wears-prada/speaker_outputs/embeddings



clustering: 100%|██████████| 1/1 [00:00<00:00,  4.37it/s]

[NeMo I 2023-09-20 15:02:50 clustering_diarizer:464] Outputs are saved in /mnt/batch/tasks/shared/LS_root/mounts/clusters/diffusion/code/Users/yaman/optimization-laion/output/devil-wears-prada directory



[NeMo W 2023-09-20 15:02:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:50 msdd_models:960] Loading embedding pickle file of scale:0 at output/devil-wears-prada/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2023-09-20 15:02:50 msdd_models:960] Loading embedding pickle file of scale:1 at output/devil-wears-prada/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2023-09-20 15:02:50 msdd_models:960] Loading embedding pickle file of scale:2 at output/devil-wears-prada/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2023-09-20 15:02:50 msdd_models:960] Loading embedding pickle file of scale:3 at output/devil-wears-prada/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2023-09-20 15:02:50 msdd_models:960] Loading embedding pickle file of scale:4 at output/devil-wears-prada/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2023-09-20 15:02:50 msdd_models:938] Loading cluster label file from output/devil-wears-prada/speaker_outputs/subsegm

100%|██████████| 1/1 [00:00<00:00, 41.62it/s]

[NeMo I 2023-09-20 15:02:50 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2023-09-20 15:02:50 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-09-20 15:02:50 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2023-09-20 15:02:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2023-09-20 15:02:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2023-09-20 15:02:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-09-20 15:02:50 msdd_models:1431]   
    


In [None]:
end_time - start_time

43.36835193634033

In [None]:
df

Unnamed: 0,speaker,start_time,end_time,text,audio
0,Speaker 2,7020,7000,Who are you?,devil-wears-prada.wav
1,Speaker 0,8523,13360,My name is Andy Sachs. I recently graduated fr...,devil-wears-prada.wav
2,Speaker 2,13740,14633,What are you doing here?,devil-wears-prada.wav
3,Speaker 0,16601,36600,"Well, I think I could do a good job as your as...",devil-wears-prada.wav
4,Speaker 2,37320,38299,So you don't read runway?,devil-wears-prada.wav
5,Speaker 0,40911,41035,No.,devil-wears-prada.wav
6,Speaker 2,41600,43271,And before today you had never heard of me?,devil-wears-prada.wav
7,Speaker 0,46760,46887,No.,devil-wears-prada.wav
8,Speaker 2,47560,50119,And you have no style or some sort of fashion?,devil-wears-prada.wav
9,Speaker 0,53806,55917,"Well, I think that depends on what you're...",devil-wears-prada.wav
