In [1]:
import ffmpeg
import numpy as np
import wget
import os
import time
import pandas as pd
import soundfile as sf
import whisper
import json
import re
import datetime
import subprocess

from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer, NeuralDiarizer
from pathlib import Path
from pydub import AudioSegment

# Request UTF-8 for Python standard I/O via the process environment.
# NOTE(review): this variable is normally read at interpreter start-up, so it
# presumably only affects child Python processes launched from this kernel,
# not the kernel's own stdout -- confirm this is the intent.
os.environ['PYTHONIOENCODING'] = 'utf-8'

### Main Transcription Pipeline

FFmpeg is used to extract the audio from the video files. By default, the first 15 minutes of each video are extracted as 16 kHz mono audio.

In [2]:
def extract_audio(input_video, output_audio, start_time='00:00:00', duration='00:15:00'):
    """Extract a mono, 16 kHz audio track from a video file using FFmpeg.

    Args:
        input_video: Path of the source video.
        output_audio: Path of the audio file to write. An existing file at
            this path is deleted before extraction starts.
        start_time: Position in the video where extraction begins
            (passed to ffmpeg as ``ss``).
        duration: Length of audio to extract (passed to ffmpeg as ``t``).

    Raises:
        ffmpeg.Error: Propagated unchanged when the ffmpeg run fails.
    """
    # Remove any stale output so a file from a previous run is never reused.
    stale_output_present = os.path.exists(output_audio)
    if stale_output_present:
        os.remove(output_audio)

    # Build the ffmpeg graph step by step, then execute it.
    source_stream = ffmpeg.input(input_video, ss=start_time, t=duration)
    audio_stream = source_stream.output(output_audio, qscale=0, ar=16000, ac=1)
    audio_stream.run(overwrite_output=True, capture_stdout=True)

OpenAI's Whisper model is used to transcribe the audio into text. The transcribe_section function takes a segment of audio and transcribes it into text.

In [3]:
# Load the English "base.en" Whisper model once, at module level, so that
# transcribe_section() can reuse the same model object for every audio section.
model = whisper.load_model("base.en")

# Function to extract and transcribe a section of audio
def transcribe_section(audio, start_time, end_time, threshold):
    """Transcribe one time window of ``audio`` with the module-level Whisper ``model``.

    The window is exported to a temporary WAV file, transcribed, and the
    temporary file is removed again before returning.

    Args:
        audio: Audio object that is sliced in milliseconds and whose slices
            provide ``export`` (the pipeline passes a pydub ``AudioSegment``).
        start_time: Start of the window, in seconds.
        end_time: End of the window, in seconds.
        threshold: Segments whose ``no_speech_prob`` is greater than or equal
            to this value are dropped from the result.

    Returns:
        The concatenated text of all kept segments, or ``''`` when the
        transcription fails.
    """
    # Extract the section from the audio; times are converted from seconds to milliseconds
    section = audio[start_time * 1000:end_time * 1000]

    temp_file = "temp_section.wav"  # Temporary file name
    try:
        # export() hands back an open file handle; close it explicitly so the
        # handle is not leaked and the file can be deleted afterwards.
        exported = section.export(temp_file, format="wav")
        if hasattr(exported, "close"):
            exported.close()

        try:
            # Transcribe the temporary file using Whisper
            result = model.transcribe(temp_file)

            # Keep only segments Whisper considers likely to contain speech
            return "".join(
                segment["text"]
                for segment in result["segments"]
                if segment["no_speech_prob"] < threshold
            )
        except subprocess.CalledProcessError as e:
            # Report subprocess failures instead of swallowing them silently
            print(f"CalledProcessError: Error processing section from {start_time} to {end_time}: {e}")
            return ''
        except RuntimeError as e:
            print(f"RuntimeError: Error processing section from {start_time} to {end_time}: {e}")
            return ''
        except Exception as e:
            print(f"Unexpected error: Error processing section from {start_time} to {end_time}: {e}")
            return ''
    finally:
        # Always clean up the temporary file, on success and on failure
        if os.path.exists(temp_file):
            os.remove(temp_file)


We only want to process mp4 files, so this function skips any file that is not an mp4 file, as well as files whose names start with "._" (metadata files generated by macOS, which we don't want to process). Files that have already been processed are skipped too.

The single parameter is used to select a single file for testing purposes.

In [4]:
# Check if a file should be processed
def should_process_file(file_path, existing_files, single):
    """Decide whether a file should be run through the pipeline.

    Args:
        file_path: ``pathlib.Path`` of the candidate file.
        existing_files: Collection of file stems that were already processed.
        single: Optional collection of stems to restrict processing to;
            ``None`` disables the restriction.

    Returns:
        ``True`` when the file is an mp4 (any letter case of the extension),
        is not a ``._`` metadata file, has not been processed yet and passes
        the ``single`` restriction; ``False`` otherwise.
    """
    file_name = file_path.stem

    # Compare the extension case-insensitively, like the folder traversal does
    if file_path.suffix.lower() != '.mp4':
        return False
    # "._" files are metadata side-cars, not real videos
    if file_name.startswith('._'):
        return False
    if file_name in existing_files:
        return False
    if single is not None and file_name not in single:
        return False
    return True

This section uses NVIDIA's NeMo diarization model to diarize the extracted audio. Diarization is the process of identifying different speakers in audio, which is necessary to produce accurate transcriptions. The hyperparameters were chosen by following the guide found here: https://github.com/NVIDIA/NeMo/blob/main/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb

In [5]:
# Diarize the audio to separate speakers
def diarize_audio(output_audio, model_config_url="https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml", device="cpu"):
    """Run NeMo speaker diarization on an audio file.

    Writes a one-line manifest to ``input_manifest.json``, loads the
    diarizer configuration (downloading it into ``model_data`` on first use)
    and runs ``NeuralDiarizer``. The diarizer output directory is set to
    ``oracle_vad``.

    Args:
        output_audio: Path of the audio file to diarize.
        model_config_url: URL of the diarizer YAML configuration, used only
            when no local copy exists yet.
        device: Value assigned to ``config.device``; ``"cpu"`` by default,
            pass ``"cuda"`` to use an NVIDIA GPU.
    """
    # Prepare metadata for NeMo diarization (fspath keeps Path inputs JSON-serialisable)
    meta = {'audio_filepath': os.fspath(output_audio), 'offset': 0, 'duration': None, 'label': 'infer', 'text': '-', 'num_speakers': None, 'rttm_filepath': None, 'uem_filepath': None}
    with open("input_manifest.json", 'w', encoding='utf-8') as fp:
        json.dump(meta, fp)
        fp.write('\n')

    # Load or download diarizer configuration
    model_config_path = "model_data/diar_infer_telephonic.yaml"
    if not os.path.exists(model_config_path):
        os.makedirs("model_data", exist_ok=True)
        model_config_path = wget.download(model_config_url, "model_data")

    # Set configuration parameters
    config = OmegaConf.load(model_config_path)
    config.diarizer.manifest_filepath = "input_manifest.json"
    config.diarizer.out_dir = "oracle_vad"
    config.device = device
    config.num_workers = 0

    # Initialize and run the diarization model
    diarizer_model = NeuralDiarizer(cfg=config)
    diarizer_model.diarize()

The output of the diarization is an RTTM file, which we use to extract the audio segments attributed to each speaker.

The time_threshold parameter is used to ignore sections that are too short. The default value is 0.5, which ignores all audio sections that last 0.5 seconds or less.

In [6]:
# Extract and aggregate speaker segments
def get_speaker_segments(rttm_path, time_threshold=0.5):
    """Read an RTTM file and merge consecutive turns of the same speaker.

    Rows whose duration is not greater than ``time_threshold`` are ignored.
    A kept row extends the current section when it has the same speaker and
    starts less than one second after the current section ends; otherwise it
    starts a new section.

    Args:
        rttm_path: Path of the RTTM file to read.
        time_threshold: Minimum duration (exclusive), in seconds, for a row
            to be kept.

    Returns:
        A list of ``[begin_time, end_time, speaker]`` lists in file order.
        An empty RTTM file yields an empty list.
    """
    columns = ['type', 'file_id', 'channel_id', 'begin_time', 'duration', 'ortho', 'speaker_type', 'speaker_name', 'confidence_score', 'signal_lookahead']
    try:
        # sep=r"\s+" replaces the deprecated delim_whitespace=True
        rttm_df = pd.read_csv(rttm_path, sep=r"\s+", names=columns, comment='#')
    except pd.errors.EmptyDataError:
        # Nothing was diarized, so there are no sections to report
        return []

    audio_sections = []
    current_section = None
    rows = zip(rttm_df["begin_time"], rttm_df["duration"], rttm_df["speaker_name"])
    for begin_time, duration, speaker in rows:
        # Written as "not >" so that NaN durations are skipped as well
        if not duration > time_threshold:
            continue

        end_time = begin_time + duration
        starts_new_section = (
            current_section is None
            or current_section[2] != speaker
            or begin_time - current_section[1] >= 1
        )
        if starts_new_section:
            if current_section is not None:
                audio_sections.append(current_section)
            current_section = [begin_time, end_time, speaker]
        else:
            current_section[1] = end_time

    if current_section is not None:
        audio_sections.append(current_section)

    return audio_sections

With the extracted segments, we can then transcribe each section using Whisper.

For each segment, Whisper outputs the probability that the segment is not speech. The threshold parameter ignores segments where this probability is at or above the threshold. By default it is set to 0.9, which ignores all segments with a 90% or higher chance of not being speech.

In [7]:
# Transcribe each speaker section
def transcribe_sections(audio, audio_sections, prob_threshold=0.9):
    """Transcribe every speaker section and collect the results in a DataFrame.

    Times in the result are shifted so that the start of the first section
    is time zero. Sections for which no text is returned are left out.

    Args:
        audio: Audio object handed to ``transcribe_section`` for each section.
        audio_sections: Iterable of ``[start_time, end_time, speaker]`` items.
        prob_threshold: No-speech threshold forwarded to ``transcribe_section``.

    Returns:
        A DataFrame with the columns "Start Time", "End Time", "Speaker" and
        "Transcription".
    """
    # All timestamps are reported relative to the first detected speech
    offset = audio_sections[0][0] if audio_sections else 0

    rows = []
    for begin, end, speaker in audio_sections:
        spoken_text = transcribe_section(audio, begin, end, prob_threshold)
        if not spoken_text:
            continue
        rows.append([begin - offset, end - offset, speaker, spoken_text])

    column_names = ["Start Time", "End Time", "Speaker", "Transcription"]
    return pd.DataFrame(rows, columns=column_names)


Once the transcriptions are complete, we remove the noise at the beginning of the transcription and identify the 2 most prominent speakers.

In [8]:
# Filter and label speakers in transcriptions
def filter_and_label_speakers(df):
    """Drop the introduction before the conversation and label the two main speakers.

    In some studies, the RA says "You will have an 8 minute conversation/chat".
    The row matching the most of these keywords is located; when it matches
    more than one keyword and starts in the first half of the recording,
    every row up to and including it is removed.

    The two speakers with the most remaining rows are renamed "Speaker 1"
    and "Speaker 2"; other speaker labels are left unchanged.

    Args:
        df: DataFrame with the columns "Start Time", "End Time", "Speaker"
            and "Transcription". It is not modified.

    Returns:
        A new DataFrame containing exactly those four columns.
    """
    columns = ["Start Time", "End Time", "Speaker", "Transcription"]
    if df.empty:
        # Nothing was transcribed, so there is nothing to filter or label
        return pd.DataFrame(columns=columns)

    ra_keywords = ["8", "eight", "minute", "conversation", "chat"]
    # Kept in a separate Series so the helper column never reaches the caller's
    # frame or the saved transcription files
    match_counts = df['Transcription'].apply(lambda row: count_matches(row, ra_keywords))
    index_RA = int(match_counts.to_numpy().argmax())  # position of the first best match

    ra_start = df.iloc[index_RA]["Start Time"]
    last_start = df.iloc[-1]["Start Time"]
    # Guard against 0/0 when the only (or last) row starts at time zero
    ra_in_first_half = last_start != 0 and ra_start / last_start < 0.5

    if ra_in_first_half and match_counts.max() > 1:
        filtered_df = df.iloc[index_RA + 1:][columns]
    else:
        filtered_df = df[columns]

    top_2_speakers = filtered_df['Speaker'].value_counts().nlargest(2).index.tolist()
    if not top_2_speakers:
        return filtered_df.copy()

    replace_map = {top_2_speakers[0]: "Speaker 1"}
    if len(top_2_speakers) > 1:
        replace_map[top_2_speakers[1]] = "Speaker 2"
    return filtered_df.replace({"Speaker": replace_map})

This function counts the instances of a list of words in the transcription lines. This is used to find the phrase "8 minute conversation", which is used to remove the beginning of the transcription that doesn't involve the participants.

In [9]:
def count_matches(row, words):
    """Count how many entries of ``words`` are contained in ``row``.

    Args:
        row: Text of one transcription line.
        words: Words to look for; each word counts at most once.

    Returns:
        The number of words found in ``row`` using the ``in`` operator
        (for strings, a substring test).
    """
    hits = 0
    for candidate in words:
        if candidate in row:
            hits += 1
    return hits

We then save the transcriptions after splitting out the 2 most prominent speakers, which allows us to produce both the "dyad" files, which include both speakers, and the "single" files, which include only 1 speaker. We also write a "full" file containing the unfiltered transcriptions; this allows us to regenerate the "dyad" and "single" files without having to rerun the entire pipeline.

The method parameter selects whether the pipeline should treat the input file as having 2 speakers [method == "dyad"] or only 1 speaker [method == "single"].

In [11]:
def get_creation_time(file_path):
    """Read the ``creation_time`` format tag of a media file with ffprobe.

    This is a best-effort lookup: callers are expected to fall back to
    another timestamp when ``None`` is returned.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        The creation time with the "T" separator replaced by a space and
        everything after the first "." removed, or ``None`` when ffprobe
        reports no value, exits with an error, or cannot be started
        (for example because it is not installed).
    """
    cmd = [
        'ffprobe',
        '-v', 'quiet',
        '-show_entries', 'format_tags=creation_time',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        str(file_path)
    ]
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: ffprobe ran but failed on this file.
        # OSError (e.g. FileNotFoundError): the ffprobe executable could not be run.
        return None

    if not output:
        return None
    return output.replace("T", " ").split(".")[0]
    

In [12]:
def save_transcriptions(df, output_directory, file_path, method="dyad"):
    """Write the labelled transcription to pipe-separated text files.

    Every output file starts with a block of ``#`` metadata lines followed by
    one ``Start Time|End Time|Speaker|Transcription`` row per segment.

    Files written below ``output_directory``:
        * ``full/<stem>_full.txt`` -- always, containing every row.
        * ``method == "dyad"``: ``dyad/<stem>_dyad.txt`` (Speaker 1 and 2),
          ``single/<stem>_single_X.txt`` (Speaker 1) and
          ``single/<stem>_single_Y.txt`` (Speaker 2).
        * ``method == "single"``: ``single/<stem>_single.txt`` (Speaker 1).
        * Any other ``method`` value writes only the "full" file.

    Args:
        df: DataFrame with numeric "Start Time"/"End Time" columns (seconds)
            plus "Speaker" and "Transcription". It is not modified.
        output_directory: Root directory for the output files.
        file_path: Path of the source video; supplies the output file stem
            and the metadata timestamps.
        method: ``"dyad"`` or ``"single"``, see above.
    """
    file_path = Path(file_path)

    # Create required directories if they don't exist
    for sub_folder in ("dyad", "single", "full"):
        os.makedirs(os.path.join(output_directory, sub_folder), exist_ok=True)

    def format_duration(seconds):
        # hh:mm:ss, rounded to the nearest second
        hours, remainder = divmod(int(round(seconds)), 3600)
        minutes, secs = divmod(remainder, 60)
        return f"{hours:02}:{minutes:02}:{secs:02}"

    def seconds_to_hms(seconds):
        # Round once to whole milliseconds; truncating the fractional part
        # separately turns values such as 0.3 s into ".299"
        total_seconds, ms = divmod(int(round(seconds * 1000)), 1000)
        hours, remainder = divmod(total_seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return f"{hours:02}:{minutes:02}:{secs:02}.{ms:03}"

    def fill_meta_data():
        # Prefer the container's creation time; fall back to the file's mtime
        creation_time = get_creation_time(file_path)
        if not creation_time:
            file_stats = file_path.stat()
            creation_time = datetime.datetime.fromtimestamp(file_stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')

        # max()/min() of an empty column is NaN, which cannot be formatted
        duration_sec = 0 if df.empty else df["End Time"].max() - df["Start Time"].min()

        return [
            f"# Title: {file_name}",
            f"# Date Modified: {creation_time}",
            f"# Duration: {format_duration(duration_sec)}",
            f"# Number of Speakers: {df['Speaker'].nunique()}",
            f"# Number of Segments: {len(df)}",
            "# Format: Start Time | End Time | Speaker | Transcription",
            "# Time Format: hh:mm:ss.mmm (hours:minutes:seconds.milliseconds)",
            # The trailing newline leaves a blank line between header and rows
            "# ---------------------------------------------\n",
        ]

    def write_file(df_subset, out_path):
        with open(out_path, 'w', encoding='utf-8') as f:
            for line in meta_lines:
                f.write(line + '\n')
            df_subset.to_csv(f, sep='|', index=False, header=False)

    # Metadata is computed from the numeric times, before they are formatted
    file_name = file_path.stem
    meta_lines = fill_meta_data()

    formatted_df = df.copy()
    formatted_df["Start Time"] = formatted_df["Start Time"].apply(seconds_to_hms)
    formatted_df["End Time"] = formatted_df["End Time"].apply(seconds_to_hms)

    is_speaker_1 = formatted_df["Speaker"] == "Speaker 1"
    is_speaker_2 = formatted_df["Speaker"] == "Speaker 2"

    if method == "dyad":
        write_file(
            formatted_df[is_speaker_1 | is_speaker_2],
            os.path.join(output_directory, "dyad", file_name + '_dyad.txt')
        )
        write_file(
            formatted_df[is_speaker_1],
            os.path.join(output_directory, "single", file_name + '_single_X.txt')
        )
        write_file(
            formatted_df[is_speaker_2],
            os.path.join(output_directory, "single", file_name + '_single_Y.txt')
        )
    elif method == "single":
        write_file(
            formatted_df[is_speaker_1],
            os.path.join(output_directory, "single", file_name + '_single.txt')
        )

    write_file(
        formatted_df,
        os.path.join(output_directory, 'full', file_name + '_full.txt')
    )


This function runs the full pipeline on one file.

In [13]:
def process_single_video(file_path, output_directory, prob_threshold=0.9, method="dyad"):
    """Run the whole transcription pipeline on one video file.

    Steps: extract audio, diarize, build speaker sections, transcribe them,
    filter and label the speakers, and save the transcription files. Progress
    and the elapsed time are printed.

    Args:
        file_path: Path (string or ``Path``) of the video to process.
        output_directory: Directory handed to ``save_transcriptions``.
        prob_threshold: No-speech threshold handed to ``transcribe_sections``.
        method: Output mode handed to ``save_transcriptions``.
    """
    video_path = Path(file_path)  # accept both strings and Path objects

    print(f"Processing file: {video_path.stem}")
    started_at = time.time()

    # Step 1: extract the audio track into a fixed temporary WAV file
    wav_path = 'temp_extracted_audio.wav'
    extract_audio(video_path, wav_path)

    # Step 2: diarize it
    diarize_audio(wav_path)

    # Step 3: turn the RTTM file written for the temporary WAV into speaker sections
    speaker_sections = get_speaker_segments('oracle_vad/pred_rttms/temp_extracted_audio.rttm')

    # Step 4: transcribe every section
    loaded_audio = AudioSegment.from_wav(wav_path)
    transcript_df = transcribe_sections(loaded_audio, speaker_sections, prob_threshold)

    # Step 5: drop the introduction and label the main speakers
    labelled_df = filter_and_label_speakers(transcript_df)

    # Step 6: write the output files
    save_transcriptions(labelled_df, output_directory, video_path, method)

    # The temporary WAV file is intentionally left in place (cleanup is disabled)
    print(f"Elapsed time: {round(time.time() - started_at, 2)} seconds")


### Below are functions used to navigate through directories and call process_single_video. You will likely have to modify them to suit your project structure.

In [14]:
def is_valid_filename(filename):
    """Tell whether ``filename`` follows the expected video naming scheme.

    A valid name starts with three or four letters followed by two digits and
    ends in ``.mp4``; letter case is ignored.

    Args:
        filename: File name (without directory) to check.

    Returns:
        ``True`` when the name matches, ``False`` otherwise.
    """
    name_pattern = re.compile(r"^[A-Z]{3,4}\d{2}.*\.mp4$", re.IGNORECASE)
    found = name_pattern.match(filename)
    return found is not None

def run_in_folder(input_directory, output_directory, threshold=0.9, method="dyad"):
    """Process every not-yet-processed video in a folder and its direct subfolders.

    A file is processed when it has an ``.mp4`` extension (any letter case),
    passes ``is_valid_filename`` and no output file for its stem exists yet
    in the "dyad", "full" or "single" output subfolders.

    Args:
        input_directory: Folder containing videos and/or subfolders of videos.
        output_directory: Folder that receives the output subfolders.
        threshold: No-speech threshold forwarded to ``process_single_video``.
        method: Output mode forwarded to ``process_single_video``.
    """
    directory_path = Path(input_directory)
    output_directory_path = Path(output_directory)
    sub_folders = ("dyad", "full", "single")

    # Ensure required subdirectories exist
    for sub_folder in sub_folders:
        (output_directory_path / sub_folder).mkdir(parents=True, exist_ok=True)

    # Recover the video stem from an output name by removing only the suffix the
    # pipeline appends, so stems that themselves contain "_" are still recognised
    output_suffix = re.compile(r"_(?:dyad|full|single(?:_[XY])?)$")
    existing_files = {
        output_suffix.sub("", file.stem)
        for sub_folder in sub_folders
        for file in (output_directory_path / sub_folder).glob("*")
    }

    def process_if_new(video_path):
        # One rule set for top-level and nested files alike
        if not video_path.is_file() or video_path.suffix.lower() != ".mp4":
            return
        if not is_valid_filename(video_path.name):
            return
        if video_path.stem in existing_files:
            return
        process_single_video(video_path, output_directory, threshold, method)
        existing_files.add(video_path.stem)

    # Sorted so files are handled in a reproducible order
    for entry in sorted(directory_path.iterdir()):
        if entry.is_dir():
            # iterdir() plus a lower-cased suffix check also finds ".MP4" files,
            # which a case-sensitive glob("*.mp4") would skip
            for nested_file in sorted(entry.iterdir()):
                process_if_new(nested_file)
        else:
            process_if_new(entry)



This function loops through a list of subfolders to execute the main pipeline.

In [15]:
# Process multiple folders
def run_all_folders(input_directory, output_directory, folders=["VTV", "FTF", "VGC"], methods=["dyad", "dyad", "dyad"], threshold=0.9):
    """Run ``run_in_folder`` for several subfolders of a common root.

    For each ``(folder, method)`` pair, ``<input_directory>/<folder>`` is
    processed into ``<output_directory>/<folder>``.

    Args:
        input_directory: Root directory containing the input subfolders.
        output_directory: Root directory for the per-folder outputs.
        folders: Names of the subfolders to process. The default list is only
            read, never modified.
        methods: Output mode for each entry of ``folders``, in the same order.
        threshold: No-speech threshold forwarded to ``run_in_folder``.

    Raises:
        ValueError: If ``folders`` and ``methods`` differ in length.
    """
    if len(folders) != len(methods):
        # zip() would silently drop the unmatched folders
        raise ValueError(
            f"folders ({len(folders)}) and methods ({len(methods)}) must have the same length"
        )

    for folder, method in zip(folders, methods):
        run_in_folder(
            os.path.join(input_directory, folder),
            os.path.join(output_directory, folder),
            threshold=threshold,
            method=method,
        )

Here are example executions of the pipeline.

In [16]:
# Example: run the whole pipeline on one recording; the output files are
# written below the "Test" directory.
process_single_video("./Data/VGC13A.MP4", "Test", prob_threshold=0.9, method="full")

Processing file: VGC13A


ffmpeg version 9c33b2f Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 9.3.0 (crosstool-NG 1.24.0.133_b0863d8_dirty)
  configuration: --prefix=/home/gnamiro/miniconda3/envs/transcript --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-libx264 --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/pkg-config
  libavutil      56. 51.100 / 56. 51.100
  libavcodec     58. 91.100 / 58. 91.100
  libavformat    58. 45.100 / 58. 45.100
  libavdevice    58. 10.100 / 58. 10.100
  libavfilter     7. 85.100 /  7. 85.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  7.100 /  5.  7.100
  libswresample   3.

[NeMo I 2025-05-19 20:59:15 msdd_models:1120] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2025-05-19 20:59:15 cloud:58] Found existing object /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2025-05-19 20:59:15 cloud:64] Re-using file from: /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2025-05-19 20:59:15 common:826] Instantiating model from pre-trained checkpoint


size=   18650kB time=00:09:56.80 bitrate= 256.0kbits/s speed=2.04e+03x    
video:0kB audio:18650kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000408%
[NeMo W 2025-05-19 20:59:16 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2025-05-19 20:59:16 modelPT:187] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: nu

[NeMo I 2025-05-19 20:59:16 features:305] PADDING: 16
[NeMo I 2025-05-19 20:59:17 features:305] PADDING: 16
[NeMo I 2025-05-19 20:59:17 save_restore_connector:275] Model EncDecDiarLabelModel was successfully restored from /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2025-05-19 20:59:17 features:305] PADDING: 16
[NeMo I 2025-05-19 20:59:17 clustering_diarizer:117] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2025-05-19 20:59:17 cloud:58] Found existing object /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-05-19 20:59:17 cloud:64] Re-using file from: /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2025-05-19 20:59:17 common:826] Instantiating model from pre-trained checkpoint


[NeMo W 2025-05-19 20:59:17 classification_models:641] Please use the EncDecSpeakerLabelModel instead of this model. EncDecClassificationModel model is kept for backward compatibility with older models.
[NeMo W 2025-05-19 20:59:17 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_tr

[NeMo I 2025-05-19 20:59:17 features:305] PADDING: 16
[NeMo I 2025-05-19 20:59:17 save_restore_connector:275] Model EncDecClassificationModel was successfully restored from /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-05-19 20:59:17 msdd_models:892] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2025-05-19 20:59:17 msdd_models:893] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }


[NeMo W 2025-05-19 20:59:17 clustering_diarizer:398] Deleting previous clustering diarizer outputs.


[NeMo I 2025-05-19 20:59:17 speaker_utils:92] Number of files to diarize: 1
[NeMo I 2025-05-19 20:59:17 clustering_diarizer:303] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:25<00:00, 25.50s/it]

[NeMo I 2025-05-19 20:59:43 vad_utils:146] The prepared manifest file exists. Overwriting!
[NeMo I 2025-05-19 20:59:43 classification_models:594] Perform streaming frame-level VAD
[NeMo I 2025-05-19 20:59:43 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-19 20:59:43 collections:880] Dataset successfully loaded with 12 items and total duration provided from manifest is  0.17 hours.
[NeMo I 2025-05-19 20:59:43 collections:886] # 12 files loaded accounting to # 1 labels



vad: 100%|██████████████████████████████████████████████████████████████████████████████| 12/12 [00:24<00:00,  2.04s/it]

[NeMo I 2025-05-19 20:59:43 clustering_diarizer:244] Generating predictions with overlapping input segments



                                                                                                                        

[NeMo I 2025-05-19 20:59:45 clustering_diarizer:256] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.35it/s]

[NeMo I 2025-05-19 20:59:45 clustering_diarizer:281] Subsegmentation for embedding extraction: scale0, oracle_vad/speaker_outputs/subsegments_scale0.json
[NeMo I 2025-05-19 20:59:45 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-19 20:59:45 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-19 20:59:45 collections:880] Dataset successfully loaded with 647 items and total duration provided from manifest is  0.25 hours.
[NeMo I 2025-05-19 20:59:45 collections:886] # 647 files loaded accounting to # 1 labels



[1/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 11/11 [04:00<00:00, 21.86s/it]

[NeMo I 2025-05-19 21:03:46 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-19 21:03:46 clustering_diarizer:281] Subsegmentation for embedding extraction: scale1, oracle_vad/speaker_outputs/subsegments_scale1.json
[NeMo I 2025-05-19 21:03:46 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-19 21:03:46 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-19 21:03:46 collections:880] Dataset successfully loaded with 778 items and total duration provided from manifest is  0.26 hours.
[NeMo I 2025-05-19 21:03:46 collections:886] # 778 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 13/13 [05:25<00:00, 25.01s/it]

[NeMo I 2025-05-19 21:09:11 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-19 21:09:11 clustering_diarizer:281] Subsegmentation for embedding extraction: scale2, oracle_vad/speaker_outputs/subsegments_scale2.json
[NeMo I 2025-05-19 21:09:11 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-19 21:09:11 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-19 21:09:11 collections:880] Dataset successfully loaded with 976 items and total duration provided from manifest is  0.26 hours.
[NeMo I 2025-05-19 21:09:11 collections:886] # 976 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 16/16 [04:55<00:00, 18.49s/it]

[NeMo I 2025-05-19 21:14:07 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-19 21:14:07 clustering_diarizer:281] Subsegmentation for embedding extraction: scale3, oracle_vad/speaker_outputs/subsegments_scale3.json
[NeMo I 2025-05-19 21:14:07 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-19 21:14:07 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-19 21:14:07 collections:880] Dataset successfully loaded with 1313 items and total duration provided from manifest is  0.27 hours.
[NeMo I 2025-05-19 21:14:07 collections:886] # 1313 files loaded accounting to # 1 labels



[4/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 21/21 [04:33<00:00, 13.03s/it]

[NeMo I 2025-05-19 21:18:41 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-19 21:18:41 clustering_diarizer:281] Subsegmentation for embedding extraction: scale4, oracle_vad/speaker_outputs/subsegments_scale4.json
[NeMo I 2025-05-19 21:18:41 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-19 21:18:41 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-19 21:18:41 collections:880] Dataset successfully loaded with 2000 items and total duration provided from manifest is  0.27 hours.
[NeMo I 2025-05-19 21:18:41 collections:886] # 2000 files loaded accounting to # 1 labels



[5/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 32/32 [06:34<00:00, 12.34s/it]

[NeMo I 2025-05-19 21:25:16 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings



[NeMo W 2025-05-19 21:25:16 speaker_utils:473] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.
clustering: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [-1:59:37<00:00, -0.04it/s]

[NeMo I 2025-05-19 21:24:52 clustering_diarizer:451] Outputs are saved in /home/gnamiro/GSGS/transcription/NemoWhisperTranscriptionPipeline/oracle_vad directory



[NeMo W 2025-05-19 21:24:52 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-19 21:24:52 msdd_models:988] Loading embedding pickle file of scale:0 at oracle_vad/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2025-05-19 21:24:52 msdd_models:988] Loading embedding pickle file of scale:1 at oracle_vad/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2025-05-19 21:24:52 msdd_models:988] Loading embedding pickle file of scale:2 at oracle_vad/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2025-05-19 21:24:52 msdd_models:988] Loading embedding pickle file of scale:3 at oracle_vad/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2025-05-19 21:24:52 msdd_models:988] Loading embedding pickle file of scale:4 at oracle_vad/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2025-05-19 21:24:52 msdd_models:966] Loading cluster label file from oracle_vad/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2025-05-19 21:24:52 collections:1212] Filtered dur

100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.85it/s]

[NeMo I 2025-05-19 21:24:52 msdd_models:1444]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2025-05-19 21:24:52 speaker_utils:92] Number of files to diarize: 1
[NeMo I 2025-05-19 21:24:52 speaker_utils:92] Number of files to diarize: 1



[NeMo W 2025-05-19 21:24:52 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-19 21:24:52 speaker_utils:92] Number of files to diarize: 1


[NeMo W 2025-05-19 21:24:52 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-19 21:24:52 speaker_utils:92] Number of files to diarize: 1


[NeMo W 2025-05-19 21:24:52 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-19 21:24:52 msdd_models:1473]   
    
Elapsed time: 1566.76 seconds


In [18]:
# Example: run the whole pipeline on one recording with method="single"; the
# output files are written below the "Test" directory.
process_single_video("./Data/FtF32A.MP4", "Test", prob_threshold=0.9, method="single")

Processing file: FtF32A


ffmpeg version 9c33b2f Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 9.3.0 (crosstool-NG 1.24.0.133_b0863d8_dirty)
  configuration: --prefix=/home/gnamiro/miniconda3/envs/transcript --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-libx264 --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/pkg-config
  libavutil      56. 51.100 / 56. 51.100
  libavcodec     58. 91.100 / 58. 91.100
  libavformat    58. 45.100 / 58. 45.100
  libavdevice    58. 10.100 / 58. 10.100
  libavfilter     7. 85.100 /  7. 85.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  7.100 /  5.  7.100
  libswresample   3.

100% [................................................................................] 7646 / 7646[NeMo I 2025-05-14 21:12:38 msdd_models:1120] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2025-05-14 21:12:38 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/diar_msdd_telephonic/versions/1.0.1/files/diar_msdd_telephonic.nemo to /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
100% [......................................................................] 107609008 / 107609008[NeMo I 2025-05-14 21:12:42 common:826] Instantiating model from pre-trained checkpoint


[NeMo W 2025-05-14 21:12:44 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2025-05-14 21:12:44 modelPT:187] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2025-05-14 21:12:44 modelPT:194] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2025-05-14 21:12:44 features:305] PADDING: 16
[NeMo I 2025-05-14 21:12:44 features:305] PADDING: 16
[NeMo I 2025-05-14 21:12:45 save_restore_connector:275] Model EncDecDiarLabelModel was successfully restored from /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2025-05-14 21:12:45 features:305] PADDING: 16
[NeMo I 2025-05-14 21:12:45 clustering_diarizer:117] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2025-05-14 21:12:45 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo to /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
100% [............................................................................] 501760 / 501760[NeMo I 2025-05-14 21:12:46 common:826] Instantiating model fro

[NeMo W 2025-05-14 21:12:46 classification_models:641] Please use the EncDecSpeakerLabelModel instead of this model. EncDecClassificationModel model is kept for backward compatibility with older models.
[NeMo W 2025-05-14 21:12:46 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_tr

[NeMo I 2025-05-14 21:12:46 features:305] PADDING: 16
[NeMo I 2025-05-14 21:12:46 save_restore_connector:275] Model EncDecClassificationModel was successfully restored from /home/gnamiro/.cache/torch/NeMo/NeMo_2.3.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-05-14 21:12:46 msdd_models:892] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2025-05-14 21:12:46 msdd_models:893] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2025-05-14 21:12:46 speaker_utils:92] Number of files to diarize: 1
[NeMo I 2025-05-14 21:12:46 clustering_diarizer:303] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.32s/it]

[NeMo I 2025-05-14 21:12:55 classification_models:594] Perform streaming frame-level VAD
[NeMo I 2025-05-14 21:12:55 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-14 21:12:55 collections:880] Dataset successfully loaded with 13 items and total duration provided from manifest is  0.18 hours.
[NeMo I 2025-05-14 21:12:55 collections:886] # 13 files loaded accounting to # 1 labels



vad: 100%|██████████████████████████████████████████████████████████████████████████████| 13/13 [00:27<00:00,  2.11s/it]

[NeMo I 2025-05-14 21:13:23 clustering_diarizer:244] Generating predictions with overlapping input segments



                                                                                                                        

[NeMo I 2025-05-14 21:13:26 clustering_diarizer:256] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.04it/s]

[NeMo I 2025-05-14 21:13:26 clustering_diarizer:281] Subsegmentation for embedding extraction: scale0, oracle_vad/speaker_outputs/subsegments_scale0.json
[NeMo I 2025-05-14 21:13:26 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-14 21:13:26 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-14 21:13:26 collections:880] Dataset successfully loaded with 559 items and total duration provided from manifest is  0.18 hours.
[NeMo I 2025-05-14 21:13:26 collections:886] # 559 files loaded accounting to # 1 labels



[1/5] extract embeddings: 100%|███████████████████████████████████████████████████████████| 9/9 [03:42<00:00, 24.74s/it]

[NeMo I 2025-05-14 21:17:09 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-14 21:17:09 clustering_diarizer:281] Subsegmentation for embedding extraction: scale1, oracle_vad/speaker_outputs/subsegments_scale1.json
[NeMo I 2025-05-14 21:17:09 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-14 21:17:09 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-14 21:17:09 collections:880] Dataset successfully loaded with 643 items and total duration provided from manifest is  0.18 hours.
[NeMo I 2025-05-14 21:17:09 collections:886] # 643 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 11/11 [05:08<00:00, 28.01s/it]

[NeMo I 2025-05-14 21:22:17 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-14 21:22:17 clustering_diarizer:281] Subsegmentation for embedding extraction: scale2, oracle_vad/speaker_outputs/subsegments_scale2.json
[NeMo I 2025-05-14 21:22:17 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-14 21:22:17 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-14 21:22:17 collections:880] Dataset successfully loaded with 795 items and total duration provided from manifest is  0.19 hours.
[NeMo I 2025-05-14 21:22:17 collections:886] # 795 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 13/13 [04:10<00:00, 19.23s/it]

[NeMo I 2025-05-14 21:26:27 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-14 21:26:27 clustering_diarizer:281] Subsegmentation for embedding extraction: scale3, oracle_vad/speaker_outputs/subsegments_scale3.json
[NeMo I 2025-05-14 21:26:27 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-14 21:26:27 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-14 21:26:27 collections:880] Dataset successfully loaded with 1054 items and total duration provided from manifest is  0.20 hours.
[NeMo I 2025-05-14 21:26:27 collections:886] # 1054 files loaded accounting to # 1 labels



[4/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 17/17 [04:15<00:00, 15.03s/it]

[NeMo I 2025-05-14 21:30:43 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings
[NeMo I 2025-05-14 21:30:43 clustering_diarizer:281] Subsegmentation for embedding extraction: scale4, oracle_vad/speaker_outputs/subsegments_scale4.json
[NeMo I 2025-05-14 21:30:43 clustering_diarizer:337] Extracting embeddings for Diarization
[NeMo I 2025-05-14 21:30:43 collections:879] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-05-14 21:30:43 collections:880] Dataset successfully loaded with 1609 items and total duration provided from manifest is  0.21 hours.
[NeMo I 2025-05-14 21:30:43 collections:886] # 1609 files loaded accounting to # 1 labels



[5/5] extract embeddings: 100%|█████████████████████████████████████████████████████████| 26/26 [04:59<00:00, 11.53s/it]

[NeMo I 2025-05-14 21:35:42 clustering_diarizer:383] Saved embedding files to oracle_vad/speaker_outputs/embeddings



[NeMo W 2025-05-14 21:35:42 speaker_utils:473] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.
clustering: 100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]

[NeMo I 2025-05-14 21:35:43 clustering_diarizer:451] Outputs are saved in /home/gnamiro/GSGS/transcription/NemoWhisperTranscriptionPipeline/oracle_vad directory



[NeMo W 2025-05-14 21:35:43 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-14 21:35:43 msdd_models:988] Loading embedding pickle file of scale:0 at oracle_vad/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2025-05-14 21:35:43 msdd_models:988] Loading embedding pickle file of scale:1 at oracle_vad/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2025-05-14 21:35:43 msdd_models:988] Loading embedding pickle file of scale:2 at oracle_vad/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2025-05-14 21:35:43 msdd_models:988] Loading embedding pickle file of scale:3 at oracle_vad/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2025-05-14 21:35:43 msdd_models:988] Loading embedding pickle file of scale:4 at oracle_vad/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2025-05-14 21:35:43 msdd_models:966] Loading cluster label file from oracle_vad/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2025-05-14 21:35:43 collections:1212] Filtered dur

100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.92it/s]

[NeMo I 2025-05-14 21:35:44 msdd_models:1444]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2025-05-14 21:35:44 speaker_utils:92] Number of files to diarize: 1
[NeMo I 2025-05-14 21:35:44 speaker_utils:92] Number of files to diarize: 1



[NeMo W 2025-05-14 21:35:44 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-14 21:35:44 speaker_utils:92] Number of files to diarize: 1


[NeMo W 2025-05-14 21:35:44 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-14 21:35:44 speaker_utils:92] Number of files to diarize: 1


[NeMo W 2025-05-14 21:35:44 der:217] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-05-14 21:35:44 msdd_models:1473]   
    
Elapsed time: 1432.97 seconds


In [None]:
# Single-recording run for NATA02R.mp4 (path is relative to the kernel's
# working directory). This cell has no recorded execution in the notebook.
# NOTE(review): the second argument ("Test") is presumably the output
# location/label -- confirm against the process_single_video definition.
process_single_video(
    "NATA02R.mp4",
    "Test",
    prob_threshold=0.9,
    method="single",
)

In [None]:
# Batch run over the "NEW DECEPTION" recordings.
# NOTE(review): input_directory is an absolute, machine-specific mount point;
# adjust it before running on another machine.
input_directory, output_directory = (
    "/mnt/d/RICKY NEW DECEPTION",
    "Transcripts/NEW DECEPTION",
)

# Three folder patterns, each processed with the "dyad" method.
# NOTE(review): `methods` is presumably paired positionally with `folders`
# -- confirm against the run_all_folders definition.
run_all_folders(
    input_directory,
    output_directory,
    folders=["NATA##", "NFTF##", "NVTV##"],
    methods=["dyad"] * 3,
    threshold=0.9,
)

In [None]:
# Batch run over the "NEW TRUST" recordings.
# NOTE(review): input_directory is an absolute, machine-specific mount point;
# adjust it before running on another machine.
input_directory, output_directory = (
    "/mnt/d/RICKY NEW TRUST",
    "Transcripts/NEW TRUST",
)

# Two folder patterns, each processed with the "dyad" method.
# NOTE(review): `methods` is presumably paired positionally with `folders`
# -- confirm against the run_all_folders definition.
run_all_folders(
    input_directory,
    output_directory,
    folders=["FTF##", "VTV##"],
    methods=["dyad"] * 2,
    threshold=0.9,
)