## Install Dependencies

In [1]:
#@title Install and Import Dependencies


SAMPLING_RATE = 16000

import torch
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example


In [2]:

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True)

(get_speech_timestamps,
save_audio,
read_audio,
VADIterator,
collect_chunks) = utils

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/vscode/.cache/torch/hub/master.zip


## Speech timestapms from full audio

In [3]:
wav = read_audio('/workspace/1010_10.10.2024-h14m24.mp3', sampling_rate=SAMPLING_RATE)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
pprint(speech_timestamps)

[{'end': 47584, 'start': 16416},
 {'end': 152544, 'start': 67616},
 {'end': 199136, 'start': 155168},
 {'end': 222688, 'start': 202784},
 {'end': 263648, 'start': 239136},
 {'end': 370656, 'start': 265248},
 {'end': 397792, 'start': 372256},
 {'end': 423904, 'start': 409632},
 {'end': 460768, 'start': 433184},
 {'end': 543712, 'start': 462368},
 {'end': 556512, 'start': 546336},
 {'end': 586208, 'start': 559136},
 {'end': 652768, 'start': 588832},
 {'end': 733152, 'start': 658464},
 {'end': 775648, 'start': 747552},
 {'end': 799200, 'start': 781344},
 {'end': 817120, 'start': 803360},
 {'end': 964064, 'start': 845344},
 {'end': 1060320, 'start': 966176},
 {'end': 1086432, 'start': 1066528},
 {'end': 1105888, 'start': 1094688},
 {'end': 1180128, 'start': 1135136},
 {'end': 1235936, 'start': 1193504},
 {'end': 1260512, 'start': 1246240},
 {'end': 1380832, 'start': 1262112},
 {'end': 1447392, 'start': 1404960},
 {'end': 1474016, 'start': 1450016},
 {'end': 1551840, 'start': 1485856},
 {'e

In [4]:
# merge all speech chunks to one audio
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)
Audio('only_speech.wav')

## Entire audio inference

In [5]:
wav = read_audio('/workspace/1010_10.10.2024-h14m24.mp3', sampling_rate=SAMPLING_RATE)
# audio is being splitted into 31.25 ms long pieces
# so output length equals ceil(input_length * 31.25 / SAMPLING_RATE)
predicts = model.audio_forward(wav, sr=SAMPLING_RATE)

## Stream imitation example

In [6]:
## using VADIterator class

vad_iterator = VADIterator(model, sampling_rate=SAMPLING_RATE)
wav = read_audio('/workspace/1010_10.10.2024-h14m24.mp3', sampling_rate=SAMPLING_RATE)

window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      break
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        print(speech_dict, end=' ')
vad_iterator.reset_states() # reset model states after each audio

{'start': 1.0} {'end': 3.0} {'start': 3.7} {'end': 3.9} {'start': 4.2} {'end': 9.5} {'start': 9.7} {'end': 12.4} {'start': 12.7} {'end': 13.9} {'start': 14.9} {'end': 16.5} {'start': 16.6} {'end': 23.2} {'start': 23.3} {'end': 24.9} {'start': 25.6} {'end': 26.5} {'start': 27.1} {'end': 28.8} {'start': 28.9} {'end': 34.0} {'start': 34.1} {'end': 34.8} {'start': 34.9} {'end': 36.6} {'start': 36.8} {'end': 40.8} {'start': 41.2} {'end': 45.8} {'start': 46.7} {'end': 48.5} {'start': 48.8} {'end': 50.0} {'start': 50.2} {'end': 51.1} {'start': 51.6} {'end': 51.7} {'start': 52.8} {'end': 60.3} {'start': 60.4} {'end': 66.3} {'start': 66.7} {'end': 67.9} {'start': 68.4} {'end': 69.1} {'start': 69.6} {'end': 69.8} {'start': 70.9} {'end': 73.8} {'start': 74.2} {'end': 74.5} {'start': 74.6} {'end': 77.2} {'start': 77.9} {'end': 78.8} {'start': 78.9} {'end': 86.3} {'start': 86.4} {'end': 86.5} {'start': 87.8} {'end': 90.5} {'start': 90.6} {'end': 92.1} {'start': 92.9} {'end': 97.0} {'start': 97.2} {

In [7]:
## just probabilities

wav = read_audio('/workspace/1010_10.10.2024-h14m24.mp3', sampling_rate=SAMPLING_RATE)
speech_probs = []
window_size_samples = 512 if SAMPLING_RATE == 16000 else 256
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      break
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
vad_iterator.reset_states() # reset model states after each audio

print(speech_probs[:10]) # first 10 chunks predicts

[0.01198811549693346, 0.1308462768793106, 0.11119253933429718, 0.33252203464508057, 0.28012147545814514, 0.16364292800426483, 0.1234249621629715, 0.08391903340816498, 0.037211060523986816, 0.012975619174540043]


In [6]:
from awss.streaming.silero_vad_model import SileroVAD

In [4]:
%pip install pydub


import librosa
import numpy as np
from pydub import AudioSegment

# Load audio using pydub
audio = AudioSegment.from_mp3('/resources/6707f1c45b27aaa595f0b9c1.txt.mp3')
audio = audio.set_channels(1)  # Convert stereo to mono channel

# Extract raw audio data and sample rate
audio_frames = np.array(audio.get_array_of_samples())
original_sample_rate = audio.frame_rate

SAMPLING_RATE = 16000
# Define the target sample rate
target_sample_rate = 16000  # Replace with your desired sample rate

# Resample the audio using librosa
audio_resampled = librosa.resample(audio_frames.astype(np.float32), orig_sr=original_sample_rate, target_sr=target_sample_rate)

print(f"Original sample rate: {original_sample_rate} Hz")
print(f"Resampled to: {target_sample_rate} Hz")

frame_duration_ms = 32
frame_size = int(SAMPLING_RATE * frame_duration_ms / 1000)

# Chunk the audio frames into frame chunks of frame_size
frames = [audio_frames[i:i + frame_size] for i in range(0, len(audio_frames), frame_size)]
print(f"Total number of frames: {len(frames)}")


Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: Loading egg at /usr/local/lib/python3.11/dist-packages/pip-23.3.2-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Note: you may need to restart the kernel to use updated packages.
Original sample rate: 48000 Hz
Resampled to: 16000 Hz
Total number of frames: 9088


In [7]:
silero_vad = SileroVAD(intensity=2, original_sr=SAMPLING_RATE)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/vscode/.cache/torch/hub/master.zip


In [38]:
speech_probs = [
    silero_vad.user_is_speaking_with_proba(frame) for frame in frames
]

In [10]:
import torchaudio
path = '/resources/6707f1c45b27aaa595f0b9c1.txt.mp3'

wav, sr = torchaudio.load(path)
sampling_rate= SAMPLING_RATE

if wav.size(0) > 1:
    wav = wav.mean(dim=0, keepdim=True)

if sr != sampling_rate:
    transform = torchaudio.transforms.Resample(orig_freq=sr,
                                                new_freq=sampling_rate)
    wav = transform(wav)
    sr = sampling_rate

In [12]:
print(wav[0][512*4:512*5][:10])

tensor([ 0.0058, -0.0042, -0.0056,  0.0106,  0.0126,  0.0003,  0.0006,  0.0030,
         0.0094, -0.0108])


In [13]:
frames[4][:10]

array([-72, -85, -69, -46, -24,  -4,   0, -11, -20, -28], dtype=int16)

In [14]:
aux = frames[4].astype(np.float32)/ 32768.0
aux = aux.reshape(1, -1)

In [15]:
aux[0][:10]

array([-0.00219727, -0.00259399, -0.00210571, -0.00140381, -0.00073242,
       -0.00012207,  0.        , -0.00033569, -0.00061035, -0.00085449],
      dtype=float32)

In [None]:
# audio_float = frames[4].astype(np.float32)
# Convert to float32 and normalize

audio_float = frames[4].astype(np.float32) / 32768.0

# Create a PyTorch tensor
audio_tensor = torch.from_numpy(audio_float)
print(audio_tensor[:10])
silero_vad.model(audio_tensor.to("cuda"), SAMPLING_RATE).item()

tensor([-0.0222, -0.0340, -0.0438, -0.0531, -0.0627, -0.0691, -0.0749, -0.0825,
        -0.0875, -0.0913])


0.001668859156779945

In [40]:

audio_tensor = torch.from_numpy(frames[3]).to(
                torch.float32
).to("cuda")  # Explicitly use float32
silero_vad.model(audio_tensor, SAMPLING_RATE).item()

0.0005708851385861635

In [41]:
speech_probs[:10]

[0.00015865043678786606,
 0.0001076274347724393,
 7.50152175896801e-05,
 0.000207205128390342,
 0.000909307156689465,
 6.539556488860399e-05,
 6.333770579658449e-05,
 8.273428829852492e-05,
 3.710254532052204e-05,
 3.828405533568002e-05]

In [16]:
sampling_rate: int = 16000
min_speech_duration_ms: int = 250
max_speech_duration_s: float = float('inf')
min_silence_duration_ms: int = 100
speech_pad_ms: int = 30
return_seconds: bool = False
visualize_probs: bool = False
progress_tracking_callback = None
neg_threshold: float = None
window_size_samples: int = 512

min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

audio_length_samples = len(audio_resampled)


triggered = False
speeches = []
current_speech = {}

threshold = 0.5
neg_threshold = max(threshold - 0.15, 0.01)
temp_end = 0  # to save potential segment end (and tolerate some silence)
prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached

for i, speech_prob in enumerate(speech_probs):
    if (speech_prob >= threshold) and temp_end:
        temp_end = 0
        if next_start < prev_end:
            next_start = window_size_samples * i

    if (speech_prob >= threshold) and not triggered:
        triggered = True
        current_speech['start'] = window_size_samples * i
        continue

    if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
        if prev_end:
            current_speech['end'] = prev_end
            speeches.append(current_speech)
            current_speech = {}
            if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
                triggered = False
            else:
                current_speech['start'] = next_start
            prev_end = next_start = temp_end = 0
        else:
            current_speech['end'] = window_size_samples * i
            speeches.append(current_speech)
            current_speech = {}
            prev_end = next_start = temp_end = 0
            triggered = False
            continue

    if (speech_prob < neg_threshold) and triggered:
        if not temp_end:
            temp_end = window_size_samples * i
        if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
            prev_end = temp_end
        if (window_size_samples * i) - temp_end < min_silence_samples:
            continue
        else:
            current_speech['end'] = temp_end
            if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                speeches.append(current_speech)
            current_speech = {}
            prev_end = next_start = temp_end = 0
            triggered = False
            continue

if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
    current_speech['end'] = audio_length_samples
    speeches.append(current_speech)

for i, speech in enumerate(speeches):
    if i == 0:
        speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
    if i != len(speeches) - 1:
        silence_duration = speeches[i+1]['start'] - speech['end']
        if silence_duration < 2 * speech_pad_samples:
            speech['end'] += int(silence_duration // 2)
            speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2))
        else:
            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
            speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples))
    else:
        speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))


In [17]:
speeches

[]

In [10]:
wav[0][512*4:512*5][:10]

tensor([ 1.7287e-04, -1.7835e-03, -5.6436e-05, -9.1720e-04,  1.0588e-03,
         4.2417e-03,  2.8809e-03, -7.5941e-04, -4.6079e-04,  1.3603e-03])

In [52]:
import numpy as np
import librosa
from pydub import AudioSegment

# Load audio using Pydub
audio = AudioSegment.from_mp3('/resources/6707f1c45b27aaa595f0b9c1.txt.mp3')

# Convert to mono channel
audio = audio.set_channels(1)

# Extract raw audio data and sample rate
audio_frames = np.array(audio.get_array_of_samples())
original_sample_rate = audio.frame_rate

# Normalize audio frames to match torchaudio
audio_frames = audio_frames.astype(np.float32) / 32768.0  # Scale to [-1.0, 1.0]

# Resample the audio to match the Torchaudio approach
target_sample_rate = 16000  # Replace with SAMPLING_RATE from the Torchaudio code
audio_resampled = librosa.resample(audio_resampled, orig_sr=original_sample_rate, target_sr=target_sample_rate)


# Ensure audio is reshaped to a 1D array matching torchaudio's format
audio_resampled = audio_resampled.reshape(1, -1)  # Add the batch dimension

# Print the desired chunk of audio to match torchaudio output
frame_size = 512  # Frame size from Torchaudio slicing
print(audio_resampled[0][frame_size * 4:frame_size * 5][:10])


[-6.1750105e-10 -1.7179341e-09 -6.1299199e-10  3.1627811e-10
 -2.2624327e-09 -1.4757580e-09  4.5234316e-10 -5.6352728e-10
 -3.2302708e-10 -7.3903139e-10]


In [49]:
frames = [audio_resampled[0][i:i + frame_size] for i in range(0, len(audio_resampled[0]), frame_size)]

In [35]:
vad_model = SileroVAD(intensity=2, original_sr=SAMPLING_RATE)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/vscode/.cache/torch/hub/master.zip


In [None]:
vad_model.user_is_speaking_with_proba()

In [50]:

audio_tensor = torch.from_numpy(frames[60])
audio_tensor = audio_tensor.to("cuda")  # Explicitly use float32
vad_model.model(audio_tensor, SAMPLING_RATE).item()

0.0031272172927856445

In [27]:
torch.from_numpy(frames[4])

tensor([ 5.5425e-03, -3.8457e-03, -5.9819e-03,  1.0917e-02,  1.2423e-02,
         4.7711e-04,  4.6549e-04,  3.1447e-03,  9.3049e-03, -1.0746e-02,
        -1.6108e-02,  5.6334e-03,  9.1409e-03, -9.1593e-03, -1.1525e-02,
         1.2044e-02,  2.3230e-03, -4.6665e-03,  9.5738e-03,  5.7726e-03,
        -3.7051e-05, -6.2045e-03, -6.4757e-03,  3.9686e-03, -8.1978e-03,
        -9.7595e-03,  3.0410e-03,  8.3605e-03,  8.4573e-03, -1.0593e-03,
         7.7998e-03,  5.9764e-03, -3.5443e-03,  3.4980e-03,  2.3120e-03,
         1.9172e-03,  1.0870e-02,  1.9365e-03, -5.1783e-03, -2.0319e-03,
         5.3008e-04,  5.8102e-04, -2.0739e-03,  1.2213e-03, -3.5242e-03,
         5.2032e-03,  5.0917e-03, -2.8442e-03, -2.4180e-03, -7.6072e-03,
         2.5869e-03,  6.5447e-03, -2.4601e-03, -5.6313e-03, -7.3503e-03,
        -1.2352e-03, -7.0723e-03, -8.8909e-03,  9.4379e-03, -1.1949e-03,
        -2.2533e-03,  6.3010e-03, -1.1567e-03, -8.7884e-03, -1.2681e-02,
        -5.8523e-03, -4.4734e-04,  2.8484e-03, -6.4