## Install Dependencies

In [23]:
#@title Install and Import Dependencies

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

100%|██████████| 1.83M/1.83M [00:00<00:00, 2.15MB/s]


In [24]:
USE_PIP = True # download model using pip package or torch.hub
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    !pip install -q onnxruntime
if USE_PIP:
  !pip install -q silero-vad
  from silero_vad import (load_silero_vad,
                          read_audio,
                          get_speech_timestamps,
                          save_audio,
                          VADIterator,
                          collect_chunks)
  model = load_silero_vad(onnx=USE_ONNX)
else:
  model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=True,
                                onnx=USE_ONNX)

  (get_speech_timestamps,
  save_audio,
  read_audio,
  VADIterator,
  collect_chunks) = utils

## Speech timestapms from full audio

In [33]:
import pandas as pd
import numpy as np
import os
import torch
import torchaudio
from glob import glob
from pprint import pprint
from tqdm.auto import tqdm

def process_dataset(dataset_path, output_file):
    SAMPLING_RATE = 16000
    """
    Process all audio files in the dataset directory and save timestamps to a text file.
    
    Args:
        dataset_path: Path to the directory containing audio files
        output_file: Path to the output text file
    """
    # Get all audio files in the dataset directory
    audio_files = []
    for ext in ['*.wav', '*.mp3', '*.flac', '*.ogg']:
        audio_files.extend(glob(os.path.join(dataset_path, ext)))
    
    print(f"Found {len(audio_files)} audio files in {dataset_path}")
    
    # Create or open the output file
    with open(output_file, 'w') as f:
        # Write header
        f.write("ID,timestamp_start,timestamp_stop\n")
        
        # Process each audio file
        for audio_file in audio_files:
            # Get the recording ID (filename without extension)
            file_name = os.path.basename(audio_file)
            ID = os.path.splitext(file_name)[0]
            print(f"Processing {ID}...")
            
            # Check if this is a "_non_speech" file - if so, skip VAD processing
            if "_non_speech" in ID:
                print(f"Detected non-speech file {ID}, skipping VAD processing")
                continue
                
            # Check if this is a "_speech" file - if so, process the whole file as speech
            if "_speech" in ID:
                try:
                    # Read the audio file to get duration
                    wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)
                    duration_frames = len(wav)
                    
                    # Write the entire file as speech
                    start_time = 0
                    end_time = duration_frames / SAMPLING_RATE
                    f.write(f"{ID},{start_time:.3f},{end_time:.3f}\n")
                    print(f"Marked entire file {ID} as speech")
                    continue
                except Exception as e:
                    print(f"Error processing {ID}: {e}")
                    continue
            
            try:
                # Read the audio file
                wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)
                
                # Get speech timestamps
                speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
                
                # Debug: Print timestamps
                print(f"Found {len(speech_timestamps)} speech segments in {ID}")
                
                # Write timestamps to file
                for ts in speech_timestamps:
                    # Convert frame indices to seconds
                    start_time = ts['start'] / SAMPLING_RATE
                    end_time = ts['end'] / SAMPLING_RATE
                    
                    # Write to file
                    f.write(f"{ID},{start_time:.3f},{end_time:.3f}\n")
                    
            except Exception as e:
                print(f"Error processing {ID}: {e}")
    
    print(f"Processing complete. Results saved to {output_file}")

def label_speech_windows(features_csv_path, timestamps_txt_path, output_csv_path=None, window_size=0.025, overlap=0.6):
    """
    Label each window in the features CSV as speech (1) or non-speech (0) based on timestamps.
    
    Args:
        features_csv_path: Path to CSV file containing features for each window
        timestamps_txt_path: Path to text file containing speech timestamps
        output_csv_path: Path to save the output CSV file (default: append '_labeled' to input filename)
        window_size: Window size in seconds (default: 0.025s = 25ms)
        overlap: Overlap between windows (default: 0.6)
    
    Returns:
        DataFrame with an additional 'speech_label' column
    """
    # Set default output path if not provided
    if output_csv_path is None:
        base_name = os.path.splitext(features_csv_path)[0]
        output_csv_path = f"{base_name}_labeled.csv"
    
    print(f"Loading features from {features_csv_path}...")
    # Load features CSV
    features_df = pd.read_csv(features_csv_path)
    
    print(f"Loading timestamps from {timestamps_txt_path}...")
    # Load timestamps
    if os.path.exists(timestamps_txt_path) and os.path.getsize(timestamps_txt_path) > 0:
        timestamps_df = pd.read_csv(timestamps_txt_path)
        timestamp_groups = timestamps_df.groupby('ID')
    else:
        print("Warning: Timestamps file is empty or does not exist. All windows will be labeled as non-speech.")
        timestamps_df = pd.DataFrame(columns=['ID', 'timestamp_start', 'timestamp_stop'])
        timestamp_groups = pd.DataFrame().groupby('ID')  # Empty group
    
    # Calculate the step size between windows
    step_size = window_size * (1 - overlap)
    
    # Initialize speech labels with zeros (non-speech)
    speech_labels = np.zeros(len(features_df), dtype=int)
    
    # Add a new column for window times if it doesn't exist
    if 'window_time' not in features_df.columns:
        # Assume windows are ordered sequentially within each recording
        # Create a temporary column for window indices
        features_df['window_idx'] = features_df.groupby('ID').cumcount()
        # Calculate window time (start time of each window)
        features_df['window_time'] = features_df['window_idx'] * step_size
        # Remove temporary column
        features_df.drop('window_idx', axis=1, inplace=True)
    
    print("Labeling windows...")
    # For each recording in the features DataFrame
    for ID, group in tqdm(features_df.groupby('ID')):
        # Get indices in the original DataFrame
        indices = group.index
        
        # Automatically label based on filename pattern
        if "_non_speech" in ID:
            # Mark all windows in this recording as non-speech (0)
            speech_labels[indices] = 0
            print(f"Auto-labeled all windows in {ID} as non-speech based on filename pattern")
            continue
        elif "_speech" in ID:
            # Mark all windows in this recording as speech (1)
            speech_labels[indices] = 1
            print(f"Auto-labeled all windows in {ID} as speech based on filename pattern")
            continue
        
        # Skip if this recording is not in the timestamps file and apply default non-speech label
        if ID not in timestamp_groups.groups:
            print(f"No timestamps found for recording {ID}, labeling as non-speech")
            speech_labels[indices] = 0
            continue
        
        # Get timestamps for this recording
        recording_timestamps = timestamp_groups.get_group(ID)
        
        # For each window in this recording
        for i, (idx, row) in enumerate(group.iterrows()):
            window_start = row['window_time']
            window_end = window_start + window_size
            
            # Check if this window overlaps with any speech segment
            for _, ts_row in recording_timestamps.iterrows():
                ts_start = ts_row['timestamp_start']
                ts_end = ts_row['timestamp_stop']
                
                # Check for overlap
                if max(window_start, ts_start) < min(window_end, ts_end):
                    speech_labels[idx] = 1
                    break  # No need to check other timestamps for this window
    
    # Add speech labels to the DataFrame
    features_df['speech_label'] = speech_labels
    
    # Save the result
    print(f"Saving labeled features to {output_csv_path}...")
    features_df.to_csv(output_csv_path, index=False)
    
    print(f"Done! Added speech labels for {sum(speech_labels)} windows out of {len(speech_labels)} total.")
    
    return features_df

# Example usage
if __name__ == "__main__":
    
    dataset_path = "C://Users//costac_c//Documents//GitHub//VAD//database//timit"
    output_file = "speech_timestamps.txt"

    process_dataset(dataset_path, output_file)

    features_csv_path = "C://Users//costac_c//Documents//GitHub//VAD//database//features//16000_25_0.6_hamming_none_MAV_ZCR_F0_TTP_EQClasses.csv"
    timestamps_txt_path = "C://Users//costac_c//Documents//GitHub//VAD//silero-vad//speech_timestamps.txt"
    
    # Run the labeling
    labeled_df = label_speech_windows(
        features_csv_path=features_csv_path,
        timestamps_txt_path=timestamps_txt_path,
        window_size=0.025,  # 25ms
        overlap=0.6
    )
    
    # Print some statistics
    print(f"Speech windows: {labeled_df['speech_label'].sum()}")
    print(f"Non-speech windows: {len(labeled_df) - labeled_df['speech_label'].sum()}")
    print(f"Speech percentage: {labeled_df['speech_label'].mean() * 100:.2f}%")

Found 256 audio files in C://Users//costac_c//Documents//GitHub//VAD//database//timit
Processing 100_non_speech...
Detected non-speech file 100_non_speech, skipping VAD processing
Processing 100_speech...
Marked entire file 100_speech as speech
Processing 101_non_speech...
Detected non-speech file 101_non_speech, skipping VAD processing
Processing 101_speech...
Marked entire file 101_speech as speech
Processing 102_non_speech...
Detected non-speech file 102_non_speech, skipping VAD processing
Processing 102_speech...
Marked entire file 102_speech as speech
Processing 103_non_speech...
Detected non-speech file 103_non_speech, skipping VAD processing
Processing 103_speech...
Marked entire file 103_speech as speech
Processing 104_non_speech...
Detected non-speech file 104_non_speech, skipping VAD processing
Processing 104_speech...
Marked entire file 104_speech as speech
Processing 105_non_speech...
Detected non-speech file 105_non_speech, skipping VAD processing
Processing 105_speech...


100%|██████████| 256/256 [00:00<00:00, 27756.02it/s]

Auto-labeled all windows in 100_non_speech as non-speech based on filename pattern
Auto-labeled all windows in 100_speech as speech based on filename pattern
Auto-labeled all windows in 101_non_speech as non-speech based on filename pattern
Auto-labeled all windows in 101_speech as speech based on filename pattern
Auto-labeled all windows in 102_non_speech as non-speech based on filename pattern
Auto-labeled all windows in 102_speech as speech based on filename pattern
Auto-labeled all windows in 103_non_speech as non-speech based on filename pattern
Auto-labeled all windows in 103_speech as speech based on filename pattern
Auto-labeled all windows in 104_non_speech as non-speech based on filename pattern
Auto-labeled all windows in 104_speech as speech based on filename pattern
Auto-labeled all windows in 105_non_speech as non-speech based on filename pattern
Auto-labeled all windows in 105_speech as speech based on filename pattern
Auto-labeled all windows in 106_non_speech as non-sp




Done! Added speech labels for 37984 windows out of 80083 total.
Speech windows: 37984
Non-speech windows: 42099
Speech percentage: 47.43%


In [9]:
# merge all speech chunks to one audio
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)
Audio('only_speech.wav')

NameError: name 'speech_timestamps' is not defined

## Entire audio inference

In [None]:
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
# audio is being splitted into 31.25 ms long pieces
# so output length equals ceil(input_length * 31.25 / SAMPLING_RATE)
predicts = model.audio_forward(wav, sr=SAMPLING_RATE)

## Stream imitation example

In [17]:
## using VADIterator class

vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)

window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      break
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states() # reset model states after each audio

{'start': 0.0} {'end': 2.1} {'start': 2.7} {'end': 4.9} {'start': 5.0} {'end': 6.8} {'start': 9.3} {'end': 13.4} {'start': 13.5} {'end': 15.2} {'start': 15.3} {'end': 15.8} {'start': 16.3} {'end': 17.9} {'start': 18.4} {'end': 19.6} {'start': 20.3} {'end': 37.6} {'start': 38.0} {'end': 38.9} {'start': 39.9} {'end': 43.3} {'start': 43.6} {'end': 44.6} {'start': 45.0} {'end': 46.8} {'start': 48.8} {'end': 50.0} {'start': 51.1} {'end': 53.4} {'start': 53.5} 

In [None]:
## just probabilities

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
speech_probs = []
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      break
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
vad_iterator.reset_states() # reset model states after each audio

print(speech_probs[:10]) # first 10 chunks predicts

[0.46508005261421204, 0.7383556962013245, 0.8762860894203186, 0.9573900699615479, 0.9656304121017456, 0.9954002499580383, 0.9969189167022705, 0.9968834519386292, 0.9967656135559082, 0.9967684745788574]
