# Supervoice Vocoder evaluation

This notebook lets you test the vocoder. It runs on the CPU, in case your GPUs are busy training.

In [1]:
import torch
import torchaudio
import torchaudio.transforms as T
import torchaudio.functional as F
from IPython.display import Audio, display
import matplotlib.pyplot as plt

In [2]:
# Fetch the BigVSAN vocoder entry point through torch.hub.
# force_reload=True bypasses the local hub cache, so the repository is
# downloaded again each time this cell runs.
model = torch.hub.load(
    repo_or_dir='ex3ndr/supervoice-vocoder',
    model='bigvsan',
    force_reload=True,
)

Downloading: "https://github.com/ex3ndr/supervoice-vocoder/zipball/master" to /home/steve/.cache/torch/hub/master.zip


## Load file
You can provide any reasonable audio file to check how well it is re-synthesized from its mel spectrogram.

In [3]:
# Upload widget for choosing the audio file to evaluate (one file at most).
# If nothing is uploaded, the loading cell below falls back to ./sample.wav.
from ipywidgets import FileUpload
upload = FileUpload(multiple=False)
# Bare last expression so the notebook renders the widget as the cell output.
upload

FileUpload(value=(), description='Upload')

In [4]:
def load_mono_audio(path, sample_rate=24000):
    """Load an audio file as a mono, one-dimensional waveform.

    Args:
        path: Audio file to read; passed straight to ``torchaudio.load``.
        sample_rate: Target sampling rate in Hz. Defaults to 24000, the rate
            used by the mel filterbank and the audio players in this notebook.

    Returns:
        A 1-D tensor of samples, resampled to ``sample_rate`` when the file
        was stored at a different rate and averaged across channels when the
        file has more than one.
    """
    # Load audio: (channels, samples) tensor plus the file's own sampling rate
    audio, sr = torchaudio.load(path)

    # Resample. This used to call a `resampler` helper with a `device`
    # argument, neither of which exists in this notebook, so every file that
    # was not already at the target rate raised NameError.
    if sr != sample_rate:
        audio = torchaudio.functional.resample(audio, orig_freq=sr, new_freq=sample_rate)

    # Convert to mono by averaging the channels
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    # Drop the channel dimension
    return audio[0]

# Choose the clip to evaluate: the uploaded file when exactly one was
# provided, otherwise the bundled sample.
uploaded_files = upload.value
if len(uploaded_files) != 1:
    source_path = "./sample.wav"
else:
    # Dump the uploaded bytes to disk so they can be loaded by path
    source_path = "eval_vocoder.out"
    with open(source_path, "w+b") as dump:
        dump.write(uploaded_files[0].content)
source = load_mono_audio(source_path)

## Mel Spectrogram
This code shows how to configure a mel spectrogram that is compatible with this vocoder.

In [5]:
def spectogram(src):
    """Compute the log-mel spectrogram in the configuration used for the vocoder.

    Settings: 1024-point FFT with a Hann window of the same length, hop of
    256 samples, 100 Slaney-normalised mel bands covering 0-12000 Hz at a
    24000 Hz sampling rate, amplitude (not power) magnitudes, natural log.

    Args:
        src: Waveform tensor to analyse (presumably the 1-D, 24 kHz output of
            ``load_mono_audio`` - confirm for other callers).

    Returns:
        Log-mel spectrogram tensor with 100 mel bands; the final STFT frame
        is dropped.
    """
    fft_size = 1024
    hop = 256

    # Mel filterbank, transposed so it can left-multiply the magnitudes
    fbanks = F.melscale_fbanks(
        n_freqs=int(fft_size // 2 + 1),
        sample_rate=24000,
        f_min=0,
        f_max=12000,
        n_mels=100,
        norm="slaney",
        mel_scale="slaney",
    ).T

    # Complex STFT with a Hann window
    frames = torch.stft(
        src,
        n_fft=fft_size,
        hop_length=hop,
        win_length=fft_size,
        window=torch.hann_window(fft_size),
        center=True,
        return_complex=True,
    )

    # Amplitude spectrum (used by BigVSAN); squaring it would give power
    # instead. The last frame is discarded.
    amplitude = torch.abs(frames[..., :-1])

    # Project onto the mel bands
    mel = torch.matmul(fbanks, amplitude)

    # Floor before the log so silent bins do not produce -inf
    return torch.log(mel.clamp(min=1e-10))

# Log-mel spectrogram of the loaded clip; this is what gets fed to the vocoder.
spec = spectogram(source)

## Resynthesizing
This code synthesizes the sound back from the spectrogram. The first player is the source audio (resampled to match the vocoder parameters); the second is the recreated audio.

In [6]:
# Vocode the log-mel spectrogram back into a waveform.
resynth = model.generate(spec)

# Two players for an A/B comparison: the source clip first, then the reconstruction.
for clip in (source, resynth):
    display(Audio(data=clip, rate=24000))