In [1]:
import argparse
import json
import os
import numpy as np
import IPython.display as ipd
from tqdm import tqdm
from scipy.io.wavfile import write

import torch
use_gpu = torch.cuda.is_available()

import librosa
from librosa.core import load
from librosa.filters import mel as librosa_mel_fn
# Mel filterbank used by get_mel: 22050 Hz audio, 1024-point FFT, 80 mel bands
# spanning 0-8000 Hz. Arguments are passed by keyword because newer librosa
# releases reject positional arguments here; keywords work on older releases too.
mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)

import params
from model import DiffVC

import sys
sys.path.append('hifi-gan/')
from env import AttrDict
from models import Generator as HiFiGAN

sys.path.append('speaker_encoder/')
from encoder import inference as spk_encoder
from pathlib import Path

In [2]:
def get_mel(wav_path):
    """Return the log-mel spectrogram of the audio file at ``wav_path``.

    The audio is loaded at 22050 Hz, truncated to a whole number of
    256-sample hops, reflect-padded by 384 samples on each side, and passed
    through a 1024-point Hann-window STFT (hop 256, ``center=False``).
    Magnitudes are projected through the module-level ``mel_basis`` and the
    natural logarithm is taken after flooring values at 1e-5.
    """
    audio, _ = load(wav_path, sr=22050)

    # Keep only complete hops, then pad manually because the STFT is not centered.
    usable_samples = (audio.shape[0] // 256) * 256
    audio = np.pad(audio[:usable_samples], 384, mode='reflect')

    spectrum = librosa.core.stft(
        audio,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        window='hann',
        center=False,
    )
    # Magnitude with a small additive term under the square root.
    magnitude = np.sqrt(np.real(spectrum) ** 2 + np.imag(spectrum) ** 2 + (1e-9))

    mel = np.matmul(mel_basis, magnitude)
    return np.log(np.clip(mel, a_min=1e-5, a_max=None))

def get_embed(wav_path):
    """Return the speaker embedding of the audio file at ``wav_path``.

    The file is first run through ``spk_encoder.preprocess_wav`` and the
    result is handed to ``spk_encoder.embed_utterance``; whatever that call
    returns is passed back unchanged.
    """
    return spk_encoder.embed_utterance(spk_encoder.preprocess_wav(wav_path))

In [3]:
# loading voice conversion model
vc_path = 'checkpts/vc/vc_libritts_wodyn.pt' # path to voice conversion model

generator = DiffVC(params.n_mels, params.channels, params.filters, params.heads,
                   params.layers, params.kernel, params.dropout, params.window_size,
                   params.enc_dim, params.spk_dim, params.use_ref_t, params.dec_dim,
                   params.beta_min, params.beta_max)

# Always deserialize the checkpoint onto the CPU: without map_location,
# torch.load restores tensors to the device they were saved from, which fails
# when that device is not present on this machine. The weights are moved to
# the GPU together with the model afterwards.
generator.load_state_dict(torch.load(vc_path, map_location='cpu'))
if use_gpu:
    generator = generator.cuda()
generator.eval()  # inference mode (e.g. disables dropout)

print(f'Number of parameters: {generator.nparams}')

Number of parameters: 126259128


In [4]:
# loading HiFi-GAN vocoder
hfg_path = 'checkpts/vocoder/vctk/' # HiFi-GAN path

with open(hfg_path + 'config.json') as f:
    h = AttrDict(json.load(f))

hifigan_universal = HiFiGAN(h)
# Deserialize onto the CPU regardless of the device the checkpoint was saved
# from (torch.load would otherwise try to restore the original device), then
# move the whole model to the GPU when one is available.
hifigan_universal.load_state_dict(
    torch.load(hfg_path + 'generator', map_location='cpu')['generator'])
if use_gpu:
    hifigan_universal = hifigan_universal.cuda()

# Assign the result so the notebook does not echo the module repr.
_ = hifigan_universal.eval()
hifigan_universal.remove_weight_norm()

Removing weight norm...


In [5]:
# loading speaker encoder
enc_model_fpath = Path('checkpts/spk_encoder/pretrained.pt') # speaker encoder path
# Same device choice as the other models: GPU when available, otherwise CPU.
spk_encoder.load_model(enc_model_fpath, device="cuda" if use_gpu else "cpu")

Loaded encoder "pretrained.pt" trained to step 1564501


In [6]:
# loading source and reference wavs, calculating mel-spectrograms and speaker embeddings
base_path = 'example/target_p225/sent1'
src_path = base_path + '_source.wav'
tgt_path = base_path + '_target.wav'

# Mel-spectrograms with a leading batch axis of size 1, plus their frame counts
# (the size of the last axis) as LongTensors.
mel_source = torch.from_numpy(get_mel(src_path)).float().unsqueeze(0)
mel_source_lengths = torch.LongTensor([mel_source.shape[-1]])

mel_target = torch.from_numpy(get_mel(tgt_path)).float().unsqueeze(0)
mel_target_lengths = torch.LongTensor([mel_target.shape[-1]])

# Speaker embeddings, also with a leading batch axis of size 1.
embed_source = torch.from_numpy(get_embed(src_path)).float().unsqueeze(0)
embed_target = torch.from_numpy(get_embed(tgt_path)).float().unsqueeze(0)

# Move every model input to the GPU in one place.
if use_gpu:
    mel_source, mel_source_lengths = mel_source.cuda(), mel_source_lengths.cuda()
    mel_target, mel_target_lengths = mel_target.cuda(), mel_target_lengths.cuda()
    embed_source, embed_target = embed_source.cuda(), embed_target.cuda()

In [7]:
# performing voice conversion
# Both runs share the same source/reference inputs and differ only in use_ot.
conversion_inputs = (mel_source, mel_source_lengths, embed_source,
                     mel_target, mel_target_lengths, embed_target)

# The use_ot=False call is made first, as before, in case the sampler draws
# from the global random state.
mel_random = generator.convert(*conversion_inputs, n_timesteps=50, use_ot=False)
mel_ot = generator.convert(*conversion_inputs, n_timesteps=50, use_ot=True)

if use_gpu:
    mel_random, mel_ot = mel_random.cuda(), mel_ot.cuda()

In [8]:
# source utterance (vocoded)
with torch.no_grad():
    # Vocode the source mel-spectrogram, then bring the waveform to the CPU,
    # drop singleton axes and limit samples to [-1, 1].
    vocoded_source = hifigan_universal.forward(mel_source)
    audio_source = vocoded_source.cpu().squeeze().clamp(-1, 1)
source_player = ipd.Audio(audio_source, rate=22050)
ipd.display(source_player)

In [9]:
# reference utterance (vocoded)
with torch.no_grad():
    # Vocode the reference mel-spectrogram, then bring the waveform to the CPU,
    # drop singleton axes and limit samples to [-1, 1].
    vocoded_target = hifigan_universal.forward(mel_target)
    audio_target = vocoded_target.cpu().squeeze().clamp(-1, 1)
target_player = ipd.Audio(audio_target, rate=22050)
ipd.display(target_player)

In [10]:
# converted speech (random)
with torch.no_grad():
    # Vocode the use_ot=False conversion result, then bring the waveform to the
    # CPU, drop singleton axes and limit samples to [-1, 1].
    vocoded_random = hifigan_universal.forward(mel_random)
    audio_random = vocoded_random.cpu().squeeze().clamp(-1, 1)
random_player = ipd.Audio(audio_random, rate=22050)
ipd.display(random_player)

In [11]:
# converted speech (optimal transport)
with torch.no_grad():
    # Vocode the use_ot=True conversion result, then bring the waveform to the
    # CPU, drop singleton axes and limit samples to [-1, 1].
    vocoded_ot = hifigan_universal.forward(mel_ot)
    audio_ot = vocoded_ot.cpu().squeeze().clamp(-1, 1)
ot_player = ipd.Audio(audio_ot, rate=22050)
ipd.display(ot_player)