In [None]:
import torch
import soundfile as sf
import torchaudio
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

# --- 1. OVERRIDE THE READ FUNCTION ---
def read_audio_safe(path: str, target_sr: int = 16000):
    """
    Load an audio file via 'soundfile' (so torchaudio's file-loading backend
    is never involved) and hand back a float tensor prepared for VAD.

    Multi-channel input is averaged down to a single channel, the result is
    shaped [1, N], and it is resampled to `target_sr` when the file's own
    sample rate differs.
    """
    # soundfile decodes the file itself and also reports its sample rate
    samples, source_sr = sf.read(path)

    # Float32 tensor view of the decoded samples
    waveform = torch.as_tensor(samples, dtype=torch.float32)

    # More than one dimension means [Samples, Channels]: average the channels
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=1)

    # Promote a flat signal [N] to [1, N]
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)

    # Already at the requested rate: nothing left to do
    if source_sr == target_sr:
        return waveform

    # Otherwise convert (e.g. 44100Hz / 48000Hz files) to the requested rate
    to_target_rate = torchaudio.transforms.Resample(
        orig_freq=source_sr,
        new_freq=target_sr,
    )
    return to_target_rate(waveform)

# --- 2. LOAD VAD MODEL ---
vad_model = load_silero_vad()

# --- 3. RUN ANALYSIS ---
# One sample rate shared by loading, VAD inference and the duration maths,
# so the three can never drift apart.
SAMPLE_RATE = 16000
test_audio_path = r"C:\Main Storage\Job\job_calvin\audio_sample_trim1.wav"

try:
    print(f"Reading: {test_audio_path}")

    # USE THE SAFE READ FUNCTION (soundfile-based, resamples to SAMPLE_RATE)
    wav = read_audio_safe(test_audio_path, target_sr=SAMPLE_RATE)

    # Run VAD with the model loaded above. The previous code passed `model`,
    # a name this cell never defines, so it either raised NameError or used
    # whatever stale object another cell had left behind.
    speech_timestamps = get_speech_timestamps(
        wav,
        vad_model,
        threshold=0.5,
        sampling_rate=SAMPLE_RATE,
        return_seconds=True,
    )

    print("-" * 30)
    print(f"Success! Found {len(speech_timestamps)} speech segments.")

    # Invert the speech segments to obtain the silent gaps between them
    silence_segments = []
    current_time = 0.0

    # Total duration in seconds: samples on the last axis / sample rate
    total_duration = wav.shape[-1] / SAMPLE_RATE

    for speech in speech_timestamps:
        # A gap exists only if this speech segment starts after the cursor
        if speech['start'] > current_time:
            silence_segments.append([round(current_time, 2), round(speech['start'], 2)])
        current_time = speech['end']

    # Trailing silence after the last speech segment (or the whole file
    # when no speech was detected at all)
    if current_time < total_duration:
        silence_segments.append([round(current_time, 2), round(total_duration, 2)])

    print("Silence Segments (Seconds):")
    print(silence_segments)

except Exception as e:
    # Top-level notebook boundary: report the failure instead of a raw traceback
    print("Error:")
    print(e)

Reading: C:\Main Storage\Job\job_calvin\audio_sample_trim1.wav
------------------------------
Success! Found 6 speech segments.
Error:
module 'torchaudio' has no attribute 'info'
