In [1]:
import os
import torch
from vocos import Vocos
from IPython.display import Audio, display
from supervoice.audio import load_mono_audio, spectogram, melscale_fbanks
from supervoice.vocoder import BigVSAN
from utils.misc import dict_to_object
from librosa.filters import mel as librosa_mel_fn

In [2]:
# Reference vocoder: pretrained Vocos weights looked up by their model identifier
# (the identifier suggests a 24 kHz mel model — TODO confirm against the model card).
vocos = Vocos.from_pretrained(
    "charactr/vocos-mel-24khz",
)

In [3]:
# Hyper-parameters handed to the BigVSAN generator constructor.
# NOTE(review): presumably these must match the settings the pretrained
# checkpoint was trained with — confirm against the checkpoint's own config.
config = dict_to_object({
    # Generator architecture.
    "resblock": "1",
    "upsample_rates": [4,4,2,2,2,2],  # product is 256, the same value as hop_size below
    "upsample_kernel_sizes": [8,8,4,4,4,4],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "activation": "snake",
    "snake_logscale": False,

    # Spectrogram / audio settings.
    "num_mels": 100,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 24000,
})
bigvsan = BigVSAN(config)

# NOTE(review): torch.load unpickles the file, so only point it at a trusted checkpoint.
# map_location="cpu" keeps the load working on machines that lack the device the
# checkpoint was saved from; load_state_dict copies the values into the model's
# own parameters afterwards.
chk = torch.load("./pretrained/bigvsan.pt", map_location="cpu")
bigvsan.load_state_dict(chk['generator'])
bigvsan.eval()

# The checkpoint's generator entries are stored as weight_g / weight_v pairs
# (see the key dump printed further down in this notebook), so weight norm is
# removed only after the state dict has been loaded.
bigvsan.remove_weight_norm()
@torch.no_grad()
def decode_bigvsan(src):
    """Call the notebook-level `bigvsan` model on `src` with autograd disabled.

    Args:
        src: Input passed unchanged to `bigvsan`.

    Returns:
        Whatever `bigvsan(src)` returns.
    """
    return bigvsan(src)

def bigvsan_spec(src):
    """Return `spectogram(src, ...)` computed with the fixed settings listed below.

    Args:
        src: Input forwarded unchanged as the first argument of `spectogram`.

    Returns:
        The result of the `spectogram` call.
    """
    # Fixed analysis settings; the numeric values equal the n_fft / num_mels /
    # hop_size / win_size / sampling_rate entries of the BigVSAN config.
    settings = dict(
        n_fft=1024,
        n_mels=100,
        n_hop=256,
        n_window=1024,
        mel_norm="slaney",
        mel_scale="slaney",
        sample_rate=24000,
    )
    return spectogram(src, **settings)



In [4]:
# One prepared sample: the tensor stored next to the audio (.pt) and the
# waveform itself (.wav), both addressed through a shared path stem.
path = "./datasets/libritts-prepared/00000006/00000804"
src = torch.load(f"{path}.pt")
source = load_mono_audio(f"{path}.wav", 24000)

# Feed the same tensor, with a leading dimension added, to both vocoders.
resynth = vocos.decode(src.unsqueeze(0))
bigvsan_out = decode_bigvsan(src.unsqueeze(0))
resynch_2 = bigvsan_out.squeeze(0)
print(resynch_2.shape)

# Playback widgets in order: original audio, Vocos output, BigVSAN output.
for clip in (source, resynth, resynch_2):
    display(Audio(data=clip, rate=24000))

torch.Size([1, 116992])


In [5]:
# Print the parameter names stored under the checkpoint's 'generator' entry,
# then the names the instantiated model currently exposes, for comparison.
checkpoint_keys = chk['generator'].keys()
print(checkpoint_keys)
model_keys = bigvsan.state_dict().keys()
print(model_keys)

odict_keys(['conv_pre.bias', 'conv_pre.weight_g', 'conv_pre.weight_v', 'ups.0.0.bias', 'ups.0.0.weight_g', 'ups.0.0.weight_v', 'ups.1.0.bias', 'ups.1.0.weight_g', 'ups.1.0.weight_v', 'ups.2.0.bias', 'ups.2.0.weight_g', 'ups.2.0.weight_v', 'ups.3.0.bias', 'ups.3.0.weight_g', 'ups.3.0.weight_v', 'ups.4.0.bias', 'ups.4.0.weight_g', 'ups.4.0.weight_v', 'ups.5.0.bias', 'ups.5.0.weight_g', 'ups.5.0.weight_v', 'resblocks.0.convs1.0.bias', 'resblocks.0.convs1.0.weight_g', 'resblocks.0.convs1.0.weight_v', 'resblocks.0.convs1.1.bias', 'resblocks.0.convs1.1.weight_g', 'resblocks.0.convs1.1.weight_v', 'resblocks.0.convs1.2.bias', 'resblocks.0.convs1.2.weight_g', 'resblocks.0.convs1.2.weight_v', 'resblocks.0.convs2.0.bias', 'resblocks.0.convs2.0.weight_g', 'resblocks.0.convs2.0.weight_v', 'resblocks.0.convs2.1.bias', 'resblocks.0.convs2.1.weight_g', 'resblocks.0.convs2.1.weight_v', 'resblocks.0.convs2.2.bias', 'resblocks.0.convs2.2.weight_g', 'resblocks.0.convs2.2.weight_v', 'resblocks.0.activation