## Dependencies

In [1]:
# %pip install numpy
# %pip install torch
# %pip install matplotlib
# %pip install torchaudio
# %pip install soundfile
# %pip install pyaudio
# %pip install jupyterplot


## Imports

In [2]:
import threading
import time
from collections import deque

import numpy as np
import pyaudio
import torch
from IPython.display import Audio, display
from jupyterplot import ProgressPlot

torch.set_num_threads(1)


In [3]:
# Fetch the Silero VAD model and its helper utilities through torch.hub.
# force_reload=True asks torch.hub for a fresh download of the repository
# instead of reusing a previously cached copy.
HUB_REPO = 'snakers4/silero-vad'
model, utils = torch.hub.load(repo_or_dir=HUB_REPO, model='silero_vad', force_reload=True)

# `utils` is a 5-tuple of helpers; give each one its own name.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /Users/devasheesh/.cache/torch/hub/master.zip


## Helper Methods


In [4]:
def get_probs(model, inputs: torch.Tensor, sample_rate: int):
    """Run the model on one chunk of audio and return its output as a Python scalar.

    Args:
        model: callable invoked as ``model(inputs, sample_rate)``; its result
            must be a single-element tensor (``.item()`` is called on it).
        inputs: audio samples for one chunk.
        sample_rate: sample rate forwarded unchanged to the model.

    Returns:
        The model output converted with ``.item()``.
    """
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        return model(inputs, sample_rate).item()

def int2float(sound):
    """Convert int16 PCM samples to float32 scaled by 1/32768.

    Args:
        sound: array-like of integer samples (e.g. the result of
            ``np.frombuffer(chunk, np.int16)``). May be empty.

    Returns:
        A new float32 array with every sample multiplied by 1/32768 and
        length-1 dimensions removed by ``squeeze()``. The input is not
        modified. An empty input yields an empty float32 array.
    """
    # astype() always copies, so read-only buffers from np.frombuffer are fine.
    sound = np.asarray(sound).astype('float32')
    # Scaling unconditionally is safe: all-zero (or empty) input is unchanged,
    # so no max() reduction is needed (it would raise on an empty array).
    sound *= 1 / 32768
    return sound.squeeze()  # depends on the use case


## Pyaudio Set-up


In [20]:
# Capture format: a single channel of 16-bit integer samples.
FORMAT, CHANNELS = pyaudio.paInt16, 1

# Number of audio samples per second of audio.
SAMPLE_RATE = 16_000

# Frames per PyAudio buffer: one tenth of a second of audio.
FRAMES_PER_BUFFER = SAMPLE_RATE // 10

# Number of samples the VAD reads from the PyAudio stream per iteration
# (256 for 8 kHz, 512 for 16 kHz), and the confidence above which a chunk
# counts as speech.
VAD_NUM_SAMPLES = 512
VAD_THRESHOLD = 0.85

# Seconds of audio to include in each recorded clip before and after the
# detected speech.
BUFFER_SECONDS_BEFORE, BUFFER_SECONDS_AFTER = 0.5, 1.0

# The same padding expressed in samples.
BUFFER_BEFORE_SIZE = int(BUFFER_SECONDS_BEFORE * SAMPLE_RATE)
BUFFER_AFTER_SIZE = int(BUFFER_SECONDS_AFTER * SAMPLE_RATE)

# PyAudio instance used to open the input stream.
audio = pyaudio.PyAudio()


## Real Time Visualization


As an enhancement, the implementation below plots the speech probabilities in real time. In contrast to the simple one, it keeps recording audio until you stop it by interrupting the kernel (the recording loop exits on `KeyboardInterrupt`). While looking into good ways to update matplotlib plots in real time, I found a simple library that does the job: https://github.com/lvwerra/jupyterplot. It has some limitations, but works really well for this use case.


In [18]:
audio_clips = []    # Finished clips as raw int16 PCM bytes; start_recording() appends to it

def start_recording():
    """Record from the microphone, plot VAD confidence live and cut out speech clips.

    Reads VAD_NUM_SAMPLES frames at a time from a PyAudio input stream, scores
    each chunk with the loaded VAD model and updates a two-panel ProgressPlot:
    the speech probability, and whether a clip is currently being recorded
    (each drawn against VAD_THRESHOLD).

    A clip starts when the confidence first exceeds VAD_THRESHOLD and is seeded
    with the chunks buffered just before it (about BUFFER_SECONDS_BEFORE of
    audio). After the confidence drops, up to BUFFER_SECONDS_AFTER of trailing
    audio is kept; if speech resumes within that window the clip simply
    continues, otherwise the clip is joined into one bytes object and appended
    to the module-level ``audio_clips`` list.

    The loop runs until it is interrupted with KeyboardInterrupt (a clip still
    in progress is saved and the clip count is printed) or until any other
    exception occurs (it is printed). The input stream is stopped and closed
    in either case.

    Returns:
        None. Results are delivered through ``audio_clips``.
    """
    # Buffer lengths expressed in whole VAD chunks.
    pre_buffer_chunks = int(BUFFER_BEFORE_SIZE / VAD_NUM_SAMPLES)
    post_buffer_chunks = int(BUFFER_AFTER_SIZE / VAD_NUM_SAMPLES)

    pp = ProgressPlot(
        plot_names=["Silero VAD", "Audio Clips Recording"],
        line_names=["speech probabilities", "threshold"],
        line_colors=["blue", "green"],
        x_label="time (s)",
        x_iterator=False,  # x values (elapsed seconds) are passed to update() manually
        width=1000,
        y_lim=[0, 1]
    )

    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=FRAMES_PER_BUFFER
    )

    current_clip = []  # chunks of the clip being recorded
    # Most recent chunks *before* the one being processed; deque drops the
    # oldest chunk automatically once maxlen is reached.
    pre_buffer = deque(maxlen=pre_buffer_chunks)
    trailing_chunks = 0  # below-threshold chunks since speech was last detected
    vad_triggered = False  # True while a clip is being recorded
    start_time = time.time()

    try:
        while True:
            audio_chunk = stream.read(VAD_NUM_SAMPLES)

            # Convert the raw bytes to the float tensor the model expects
            audio_int16 = np.frombuffer(audio_chunk, np.int16)
            audio_float32 = int2float(audio_int16)
            audio_tensor = torch.from_numpy(audio_float32)

            # Get VAD confidence
            new_confidence = get_probs(model, audio_tensor, SAMPLE_RATE)

            # Update plot with time in seconds
            elapsed_time = time.time() - start_time
            pp.update(elapsed_time, [[new_confidence, VAD_THRESHOLD], [int(vad_triggered), VAD_THRESHOLD]])

            if new_confidence > VAD_THRESHOLD:
                if not vad_triggered:
                    # Seed the clip with the audio preceding this chunk. The
                    # current chunk is not in pre_buffer yet, so it is added
                    # exactly once below.
                    current_clip = list(pre_buffer)
                    vad_triggered = True

                current_clip.append(audio_chunk)
                # Speech resumed: the trailing window starts over, so pauses
                # inside an utterance do not add up and end the clip early.
                trailing_chunks = 0

            elif vad_triggered:
                if trailing_chunks < post_buffer_chunks:
                    # Still inside the trailing window: keep the chunk
                    current_clip.append(audio_chunk)
                    trailing_chunks += 1
                else:
                    # Trailing window exhausted: store the finished clip
                    audio_clips.append(b''.join(current_clip))
                    vad_triggered = False
                    current_clip = []
                    trailing_chunks = 0

            # Remember this chunk as lead-in audio for a future clip
            pre_buffer.append(audio_chunk)

    except KeyboardInterrupt:
        # A non-empty current_clip means the interrupt arrived while a clip
        # was being captured; keep what was recorded so far.
        if current_clip:
            audio_clips.append(b''.join(current_clip))

        print('Recorded clips:', len(audio_clips))

    except Exception as e:
        print(e)

    finally:
        # Release the audio device first so it is freed even if finalizing
        # the plot fails.
        stream.stop_stream()
        stream.close()
        pp.finalize()


In [19]:
# Rebind audio_clips to a fresh list so this run starts with no clips,
# then record; start_recording() appends each finished clip to audio_clips.
audio_clips = []    # Reset the audio clips
start_recording()


The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
  File "code/__torch__/vad/model/vad_annotator.py", line 26, in forward
    if _2:
      _3 = torch.format(_0, (torch.size(x0))[-1])
      ops.prim.RaiseException(_3, "builtins.ValueError")
      ~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
    else:
      pass

Traceback of TorchScript, original code (most recent call last):
  File "/home/keras/notebook/nvme1/adamnsandle/silero-models-research/vad/model/vad_annotator.py", line 480, in forward
        num_samples = 512 if sr == 16000 else 256
        if x.shape[-1] != num_samples:
            raise ValueError(f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")
            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
    
        batch_size = x.shape[0]
builtins.ValueError: Provided number of samples is 513 (Supported values: 256 for 8000

## Show Recorded audio clips

In [13]:
def play_audio_clips(audio_clips, sample_rate=SAMPLE_RATE):
    """Show an inline audio player for every recorded clip.

    Args:
        audio_clips: iterable of clips, each a bytes-like buffer that is
            read as int16 samples.
        sample_rate: rate passed to the IPython ``Audio`` widget; defaults
            to SAMPLE_RATE.
    """
    for clip_number, raw_clip in enumerate(audio_clips, start=1):
        print(f"Playing clip {clip_number}:")
        # Decode the bytes as int16 samples and scale them to float32
        samples = int2float(np.frombuffer(raw_clip, np.int16))
        display(Audio(samples, rate=sample_rate))


play_audio_clips(audio_clips)


Playing clip 1:


Playing clip 2:


Playing clip 3:


Playing clip 4:


Playing clip 5:


Playing clip 6:


Playing clip 7:
